# --- standard library ---
import contextlib
import logging
import tempfile
import urllib.parse
from typing import Optional, TYPE_CHECKING

# --- kani ---
from kani import ChatMessage, ChatRole, ai_function
from kani.engines import BaseEngine

# --- optional dependencies for the bundled tools ---
# If any of these are missing, re-raise as an ImportError carrying install instructions
# (``from None`` hides the original traceback).
try:
    import httpx
    import pymupdf
    import pymupdf4llm
    from playwright.async_api import (
        BrowserContext,
        TimeoutError as PlaywrightTimeoutError,
        async_playwright,
        Error as PlaywrightError,
    )
except ImportError:
    raise ImportError(
        "You are missing required dependencies to use the bundled tools. Please install ReDel using `pip install"
        ' "redel[bundled]"`.'
    ) from None

# --- project-local ---
from redel.tools import ToolBase
from .webutils import get_google_links, web_markdownify, web_summarize

# Only needed for annotations; never imported at runtime.
if TYPE_CHECKING:
    from playwright.async_api import Page

# module-level logger
log = logging.getLogger(__name__)
class Browsing(ToolBase):
    """
    A tool that provides tools to search Google and visit webpages.

    Renders webpages in Markdown and has basic support for reading PDFs.
    """

    # app-global browser instance: these are class attributes, so every Browsing instance shares one
    # Playwright driver / browser / browser context, launched lazily by get_browser()
    playwright = None
    browser = None
    browser_context = None

    def __init__(
        self,
        *args,
        long_engine: Optional[BaseEngine] = None,
        max_webpage_len: Optional[int] = None,
        **kwargs,
    ):
        """
        :param long_engine: The engine handed to ``web_summarize`` when a page is too long. If None, the
            owning kani's own engine is used instead.
        :param max_webpage_len: The maximum number of tokens a page may take up before it is summarized.
            Defaults to a third of the kani engine's ``max_context_size``.
        """
        super().__init__(*args, **kwargs)
        # per-instance HTTP client used for the content-type probe and for PDF/JSON downloads
        # NOTE(review): this client is never closed anywhere in this class - confirm whether the app lifecycle
        # guarantees the tool is not reused after close() before adding an `aclose()` there.
        self.http = httpx.AsyncClient(follow_redirects=True)
        # per-instance browser tab, created lazily by get_page()
        self.page: Optional["Page"] = None
        self.long_engine = long_engine

        # the max number of tokens before asking for a summary - default 1/3rd ctx len
        if max_webpage_len is None:
            max_webpage_len = self.kani.engine.max_context_size // 3
        self.max_webpage_len = max_webpage_len

        # content handlers: keys are matched as *prefixes* of the lowercased Content-Type header,
        # in insertion order, and the first match wins (see visit_page)
        self.content_handlers = {
            "application/pdf": self.pdf_content,
            "application/json": self.json_content,
            "text/": self.html_content,
        }

    # === resources + app lifecycle ===
    # noinspection PyMethodMayBeStatic
    async def get_browser(self, **kwargs) -> BrowserContext:
        """Get the current active browser context, or launch it on the first call.

        :param kwargs: Forwarded to ``chromium.launch``; only used when the browser is not yet running.
        """
        if Browsing.playwright is None:
            Browsing.playwright = await async_playwright().start()
        if Browsing.browser is None:
            Browsing.browser = await Browsing.playwright.chromium.launch(**kwargs)
        if Browsing.browser_context is None:
            Browsing.browser_context = await Browsing.browser.new_context()
        return Browsing.browser_context

    async def get_page(self, create=True) -> Optional["Page"]:
        """Get the current page.

        Returns None if the browser is not on a page unless `create` is True, in which case it creates a new page.
        """
        if self.page is None and create:
            context = await self.get_browser()
            self.page = await context.new_page()
        return self.page

    async def cleanup(self):
        """Run the parent cleanup, then close this instance's page (if any) and forget it."""
        await super().cleanup()
        if self.page is not None:
            await self.page.close()
            self.page = None

    async def close(self):
        """Run the parent close, then shut down the shared browser and Playwright driver.

        All shared class-level handles are reset to None so that a later ``get_browser()`` call relaunches
        everything from scratch.
        """
        await super().close()
        try:
            # The context belongs to the browser that is about to be closed. Forget it too, otherwise
            # get_browser() would relaunch a browser but keep handing out this dead context.
            Browsing.browser_context = None
            # clear each shared reference *before* awaiting so a concurrent close() doesn't close it twice
            if (browser := Browsing.browser) is not None:
                Browsing.browser = None
                await browser.close()
            if (pw := Browsing.playwright) is not None:
                Browsing.playwright = None
                await pw.stop()
        except PlaywrightError:
            # sometimes playwright doesn't like closing in parallel
            pass

    # ==== functions ====
    @ai_function()
    async def search(self, query: str):
        """Search a query on Google."""
        # NOTE(review): kani's @ai_function presumably exposes the docstring above to the model as the function
        # description, so it is kept verbatim - confirm before rewording it.
        page = await self.get_page()
        query_enc = urllib.parse.quote_plus(query)
        await page.goto(f"https://www.google.com/search?q={query_enc}")
        # content
        try:
            # if the main content is borked, fallback
            search_html = await page.inner_html("#main", timeout=5000)
            search_text = web_markdownify(search_html, include_links=False)
            # links
            search_loc = page.locator("#search")
            links = await get_google_links(search_loc)
            return (
                f"{search_text.strip()}\n\nYou should visit some of these links for more information or delegate"
                f" helpers to visit multiple:\n\n===== Links =====\n{links.to_md_str()}"
            )
        except PlaywrightTimeoutError:
            # fallback: return the whole page rendered as Markdown
            content_html = await page.content()
            content = web_markdownify(content_html)
            return content

    @ai_function()
    async def visit_page(self, href: str):
        """Visit a web page and view its contents."""
        # NOTE(review): kani's @ai_function presumably exposes the docstring above to the model as the function
        # description, so it is kept verbatim - confirm before rewording it.
        # first, let's do a HEAD request and get the content-type so we know how to actually process the info
        try:
            resp = await self.http.head(href)
        except httpx.HTTPError as e:
            # the probe is only an optimization - if it fails at the transport level, let the browser try
            log.warning("HEAD request to %s failed (%r); falling back to the HTML handler", href, e)
            return await self.html_content(href)
        content_type = resp.headers.get("Content-Type", "").lower()
        # then delegate to the content type handler
        handler = next((f for t, f in self.content_handlers.items() if content_type.startswith(t)), None)
        if handler is None:
            log.warning("Could not find handler for content type: %s", content_type)
            handler = self.html_content
        return await handler(href)

    # ==== content renderers ====
    async def pdf_content(self, href: str) -> str:
        """Handler for application/pdf content types.

        Downloads the PDF to a temporary file, converts it to Markdown, and summarizes it if it is too long.

        :raises httpx.HTTPStatusError: If the download responds with a 4xx/5xx status.
        """
        # NOTE(review): the file is re-opened by name while still open here, which the tempfile docs say is
        # not possible on Windows with the default arguments - confirm Windows is out of scope.
        with tempfile.NamedTemporaryFile() as f:
            # download into a tempfile
            async with self.http.stream("GET", href) as response:
                # don't feed an HTTP error page to the PDF parser
                response.raise_for_status()
                async for chunk in response.aiter_bytes():
                    f.write(chunk)
            # the PDF library re-opens the file by name, so buffered bytes must hit the disk first
            f.flush()
            # then read it
            doc = pymupdf.open(f.name, filetype="pdf")
            try:
                content = pymupdf4llm.to_markdown(doc)
            finally:
                # release the document handle before the tempfile is deleted
                doc.close()
        # summarization
        content = await self.maybe_summarize(content)
        return content

    async def json_content(self, href: str) -> str:
        """Handler for application/json content types.

        Returns the response body as text, without summarization.

        :raises httpx.HTTPStatusError: If the request responds with a 4xx/5xx status.
        """
        # a non-streaming get() has already read the whole body, so no explicit aread() is needed
        resp = await self.http.get(href)
        resp.raise_for_status()
        return resp.text

    async def html_content(self, href: str) -> str:
        """Default handler for all other content types.

        Loads the page in the browser and returns its title, final URL, and Markdown-rendered content
        (summarized if it is too long).
        """
        page = await self.get_page()
        await page.goto(href)
        # give the page up to 10s to settle; if it never goes idle, use whatever has loaded so far
        with contextlib.suppress(PlaywrightTimeoutError):
            await page.wait_for_load_state("networkidle", timeout=10_000)
        # header
        title = await page.title()
        header = f"{title}\n{'=' * len(title)}\n{page.url}\n\n"

        content_html = await page.content()
        content = web_markdownify(content_html)
        # summarization
        content = await self.maybe_summarize(content)
        # result
        result = header + content
        return result

    # ==== helpers ====
    async def maybe_summarize(self, content, max_len=None):
        """Return *content* unchanged if it fits in *max_len* tokens, otherwise a summary of it.

        :param content: The rendered page content.
        :param max_len: The token budget; defaults to ``self.max_webpage_len`` when None.
        """
        # explicit None check so that a caller-supplied 0 is honored rather than replaced by the default
        if max_len is None:
            max_len = self.max_webpage_len
        # measure the content the way it would be counted as a visit_page function result
        if self.kani.message_token_len(ChatMessage.function("visit_page", content)) > max_len:
            # give the summarizer the non-function messages of the conversation so far as context
            msg_ctx = "\n\n".join(
                m.text for m in self.kani.chat_history if m.role != ChatRole.FUNCTION and m.text is not None
            )
            content = await web_summarize(
                content,
                parent=self.kani,
                long_engine=self.long_engine or self.kani.engine,
                task=(
                    "Keep the current context in mind:\n"
                    f"<context>\n{msg_ctx}\n</context>\n\n"
                    "Keeping the context and task in mind, please summarize the main content above."
                ),
            )
        return content