Last active 1 month ago

Fetch text content from URLs (http/https).

web_fetch.py Raw
1#!/usr/bin/env python3
2"""Web fetch extension - fetch content from URLs"""
3
4import urllib.request
5import re
6
@register_tool(
    "fetch",
    "Fetch content from URL (supports http/https)",
    {"url": "string", "timeout": "number?"}
)
def fetch(args):
    """Fetch a URL and return its text content with HTML markup stripped.

    Args:
        args: dict with keys:
            "url" (str): the http/https URL to fetch.
            "timeout" (number, optional): socket timeout in seconds
                (default 10).

    Returns:
        str: cleaned page text, truncated to 5000 characters, or an
        "error: ..." string on failure. Errors are returned rather than
        raised so the tool host can surface them to the caller.
    """
    url = args["url"]
    timeout = args.get("timeout", 10)

    # Enforce the documented schemes; without this check urllib would
    # happily fetch file:// URLs and leak local files.
    if not url.lower().startswith(("http://", "https://")):
        return "error: only http/https URLs are supported"

    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'nanocode/1.0')
        # Context manager guarantees the connection is closed even if
        # read/decode fails (the original leaked the response object).
        with urllib.request.urlopen(req, timeout=timeout) as response:
            # Honor the charset declared in the Content-Type header;
            # fall back to UTF-8 and skip undecodable bytes.
            charset = response.headers.get_content_charset() or 'utf-8'
            content = response.read().decode(charset, errors='ignore')

        # Simple HTML cleanup: drop script/style blocks (case-insensitive,
        # so <SCRIPT> is caught too), then all remaining tags, then
        # collapse runs of whitespace.
        content = re.sub(r'<script[^>]*>.*?</script>', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'<style[^>]*>.*?</style>', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'<[^>]+>', ' ', content)
        content = re.sub(r'\s+', ' ', content).strip()

        # Cap output size; len(content) is evaluated before the
        # assignment, so it reports the full pre-truncation length.
        if len(content) > 5000:
            content = content[:5000] + f"\n... (truncated, total {len(content)} chars)"

        return content
    except Exception as e:
        # Tool convention: report errors as text instead of raising.
        return f"error: {e}"