liusijin revised this gist 1 month ago. Go to revision
1 file changed, 35 insertions
web_fetch.py(file created)
| @@ -0,0 +1,35 @@ | |||
| 1 | + | #!/usr/bin/env python3 | |
| 2 | + | """Web fetch extension - fetch content from URLs""" | |
| 3 | + | ||
| 4 | + | import urllib.request | |
| 5 | + | import re | |
| 6 | + | ||
@register_tool(
    "fetch",
    "Fetch content from URL (supports http/https)",
    {"url": "string", "timeout": "number?"}
)
def fetch(args):
    """Fetch a URL and return its text content with HTML tags stripped.

    Args:
        args: dict with "url" (string, must be http/https) and optional
            "timeout" (seconds, default 10).

    Returns:
        Cleaned page text (capped at 5000 chars), or an "error: ..."
        string on any failure — this tool never raises.
    """
    url = args["url"]
    timeout = args.get("timeout", 10)

    # Enforce the documented contract: only http/https. Without this guard,
    # urlopen also accepts schemes such as file://, which would let a caller
    # read arbitrary local files through this tool.
    if not url.lower().startswith(("http://", "https://")):
        return f"error: unsupported URL scheme (expected http/https): {url}"

    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'nanocode/1.0')
        # Context manager closes the underlying connection even if read()
        # fails (the original never closed the response — resource leak).
        with urllib.request.urlopen(req, timeout=timeout) as response:
            # NOTE(review): charset is assumed utf-8 regardless of the
            # Content-Type header; non-utf-8 pages degrade via errors='ignore'.
            content = response.read().decode('utf-8', errors='ignore')

        # Crude HTML-to-text cleanup: drop script/style bodies, then all
        # remaining tags, then collapse runs of whitespace.
        content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL)
        content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL)
        content = re.sub(r'<[^>]+>', ' ', content)
        content = re.sub(r'\s+', ' ', content).strip()

        # Cap output size. The f-string is evaluated before the reassignment,
        # so len(content) here is the original (pre-truncation) length.
        if len(content) > 5000:
            content = content[:5000] + f"\n... (truncated, total {len(content)} chars)"

        return content
    except Exception as e:
        # Best-effort tool: surface failures as text instead of raising.
        return f"error: {e}"
Newer
Older