Spaces:

heysho
/

web-browsing-agent

Running

App Files Files Community

heysho commited on Sep 7

Commit

5949f3d

•

1 Parent(s): 1c2e898

Upload 2 files

Browse files

Files changed (2) hide show

tools/fetch_page.py +88 -0
tools/search_ddg.py +41 -0

tools/fetch_page.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import requests
+import html2text
+from readability import Document
+from langchain_core.tools import tool
+from langchain_core.pydantic_v1 import (BaseModel, Field)
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+class FetchPageInput(BaseModel):
+    url: str = Field()
+    page_num: int = Field(0, ge=0)
+@tool(args_schema=FetchPageInput)
+def fetch_page(url, page_num=0, timeout_sec=10):
+    """
+    指定されたURLから（とページ番号から）ウェブページのコンテンツを取得するツール。
+    `status` と `page_content`（`title`、`content`、`has_next`インジケーター）を返します。
+    statusが200でない場合は、ページの取得時にエラーが発生しています。（他のページの取得を試みてください）
+    デフォルトでは、最大2,000トークンのコンテンツのみが取得されます。
+    ページにさらにコンテンツがある場合、`has_next`の値はTrueになります。
+    続きを読むには、同じURLで`page_num`パラメータをインクリメントして、再度入力してください。
+    （ページングは0から始まるので、次のページは1です）
+    1ページが長すぎる場合は、**3回以上取得しないでください**（メモリの負荷がかかるため）。
+    Returns
+    -------
+    Dict[str, Any]:
+    - status: str
+    - page_content
+      - title: str
+      - content: str
+      - has_next: bool
+    """
+    try:
+        response = requests.get(url, timeout=timeout_sec)
+        response.encoding = 'utf-8'
+    except requests.exceptions.Timeout:
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'Could not download page due to Timeout Error. Please try to fetch other pages.'}
+        }
+    if response.status_code != 200:
+        return {
+            "status": response.status_code,
+            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
+        }
+    try:
+        doc = Document(response.text)
+        title = doc.title()
+        html_content = doc.summary()
+        content = html2text.html2text(html_content)
+    except:
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'Could not parse page. Please try to fetch other pages.'}
+        }
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        model_name='gpt-4o-mini',
+        chunk_size=1000,
+        chunk_overlap=0,
+    )
+    chunks = text_splitter.split_text(content)
+    if page_num >= len(chunks):
+        return {
+            "status": 500,
+            "page_content": {'error_message': 'page_num parameter looks invalid. Please try to fetch other pages.'}
+        }
+    elif page_num >= 3:
+        return {
+            "status": 503,
+            "page_content": {'error_message': "Reading more of the page_num's content will overload your memory. Please provide your response based on the information you currently have."}
+        }
+    else:
+        return {
+            "status": 200,
+            "page_content": {
+                "title": title,
+                "content": chunks[page_num],
+                "has_next": page_num < len(chunks) - 1
+            }
+        }

tools/search_ddg.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from itertools import islice
+from duckduckgo_search import DDGS
+from langchain_core.tools import tool
+from langchain_core.pydantic_v1 import (BaseModel, Field)
+class SearchDDGInput(BaseModel):
+    query: str = Field(description="検索したいキーワードを入力してください")
+@tool(args_schema=SearchDDGInput)
+def search_ddg(query, max_result_num=5):
+    """
+    DuckDuckGo検索を実行するためのツールです。
+    検索したいキーワードを入力して使用してください。
+    検索結果の各ページのタイトル、スニペット（説明文）、URLが返されます。
+    このツールから得られる情報は非常に簡素化されており、時には古い情報の場合もあります。
+    必要な情報が見つからない場合は、必ず `WEB Page Fetcher` ツールを使用して各ページの内容を確認してください。
+    文脈に応じて最も適切な言語を使用してください（ユーザーの言語と同じである必要はありません）。
+    例えば、プログラミング関連の質問では、英語で検索するのが最適です。
+    Returns
+    -------
+    List[Dict[str, str]]:
+    - title
+    - snippet
+    - url
+    """
+    res = DDGS().text(query + " latest", region='JP', safesearch='off', backend="lite")
+    return [
+        {
+            "title": r.get('title', ""),
+            "snippet": r.get('body', ""),
+            "url": r.get('href', "")
+        }
+        for r in islice(res, max_result_num)
+    ]