Last active 1 month ago

Fetch content from URLs.

liusijin revised this gist 1 month ago. Go to revision

1 file changed, 35 insertions

web_fetch.py(file created)

@@ -0,0 +1,35 @@
1 + #!/usr/bin/env python3
2 + """Web fetch extension - fetch content from URLs"""
3 +
4 + import urllib.request
5 + import re
6 +
@register_tool(
    "fetch",
    "Fetch content from URL (supports http/https)",
    {"url": "string", "timeout": "number?"}
)
def fetch(args):
    """Fetch a web page and return it reduced to plain text.

    Args:
        args: dict with key "url" (required) and optional "timeout"
              in seconds (defaults to 10).

    Returns:
        Whitespace-normalized text of the page with script/style blocks
        and all tags stripped, truncated to 5000 characters (with a note
        giving the original length), or an "error: ..." string if the
        fetch fails for any reason.
    """
    url = args["url"]
    timeout = args.get("timeout", 10)

    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'nanocode/1.0')
        # Use a context manager so the connection is closed deterministically
        # instead of leaking until garbage collection (original never closed it).
        with urllib.request.urlopen(req, timeout=timeout) as response:
            content = response.read().decode('utf-8', errors='ignore')

        # Crude HTML-to-text cleanup: drop script/style bodies (IGNORECASE so
        # <SCRIPT>/<Style> variants are caught too), strip remaining tags,
        # then collapse runs of whitespace.
        content = re.sub(r'<script[^>]*>.*?</script>', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'<style[^>]*>.*?</style>', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'<[^>]+>', ' ', content)
        content = re.sub(r'\s+', ' ', content).strip()

        # Cap output length; len(content) in the f-string is evaluated before
        # the assignment, so it reports the pre-truncation total.
        if len(content) > 5000:
            content = content[:5000] + f"\n... (truncated, total {len(content)} chars)"

        return content
    except Exception as e:
        # Tool boundary: surface any failure as a result string rather than
        # raising, so the caller's tool loop keeps running.
        return f"error: {e}"
Newer Older