← 返回聊天
新建
删除
Models
gpt5.php
gpt5_file.php
gpt5_mini_file.php
openai_chat.php
Tools
get_time.php
get_weather.php
global_search_messages.php
math.php
memo.php
news.php
search_arxiv.php
search_crossref.php
search_github_code.php
search_pubmed.php
search_semantic_scholar.php
stock_market.php
url.php
<?php

declare(strict_types=1);

/**
 * Generic web page fetcher + text extractor (simplified).
 *
 * - mode=text: return the extracted body text (default)
 * - mode=full: additionally return status details, response headers and an HTML snippet
 *
 * The file returns a tool definition array (name, description, JSON-schema
 * parameters and a `run` closure).
 */

/**
 * Tell whether an address must NOT be fetched.
 *
 * Only IPv4 is supported: anything that is not a valid IPv4 address
 * (including every IPv6 address) is reported as private so that it is rejected.
 *
 * @param string $ip Textual IP address.
 * @return bool true when the address is non-IPv4 or lies in a private/reserved range.
 */
function is_private_ip(string $ip): bool
{
    if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) === false) {
        return true;
    }

    $long = ip2long($ip);
    if ($long === false) {
        return true;
    }

    // Inclusive [start, end] ranges that are never publicly routable.
    $blockedRanges = [
        ['0.0.0.0', '0.255.255.255'],         // "this" network
        ['10.0.0.0', '10.255.255.255'],       // private
        ['100.64.0.0', '100.127.255.255'],    // carrier-grade NAT (RFC 6598)
        ['127.0.0.0', '127.255.255.255'],     // loopback
        ['169.254.0.0', '169.254.255.255'],   // link-local (incl. cloud metadata endpoints)
        ['172.16.0.0', '172.31.255.255'],     // private
        ['192.0.0.0', '192.0.0.255'],         // IETF protocol assignments
        ['192.0.2.0', '192.0.2.255'],         // documentation (TEST-NET-1)
        ['192.168.0.0', '192.168.255.255'],   // private
        ['198.18.0.0', '198.19.255.255'],     // benchmarking
        ['198.51.100.0', '198.51.100.255'],   // documentation (TEST-NET-2)
        ['203.0.113.0', '203.0.113.255'],     // documentation (TEST-NET-3)
        ['224.0.0.0', '239.255.255.255'],     // multicast
        ['240.0.0.0', '255.255.255.255'],     // reserved + broadcast
    ];

    foreach ($blockedRanges as [$start, $end]) {
        if ($long >= ip2long($start) && $long <= ip2long($end)) {
            return true;
        }
    }

    return false;
}

/**
 * Validate a URL and resolve the address that will actually be contacted.
 *
 * Checks scheme (http/https only), rejects "localhost", resolves every A record
 * of the host and rejects the URL if ANY of them is private/reserved.
 *
 * @param string $url Absolute URL.
 * @return array{host:string, port:int, ip:string, literal:bool}
 *         `ip` is the first validated address; `literal` is true when the URL
 *         host already was an IP address (no DNS lookup happened).
 * @throws RuntimeException when the URL is malformed, not http/https, unresolvable or points to a blocked address.
 */
function resolve_safe_target(string $url): array
{
    $parts = parse_url($url);
    if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
        throw new RuntimeException('URL 不合法');
    }

    $scheme = strtolower((string)$parts['scheme']);
    if (!in_array($scheme, ['http', 'https'], true)) {
        throw new RuntimeException('仅允许 http/https');
    }

    // parse_url keeps the brackets of IPv6 literals ("[::1]"); strip them so
    // the literal is recognised as an IP below.
    $host = strtolower(trim((string)$parts['host'], '[]'));
    if ($host === 'localhost') {
        throw new RuntimeException('禁止访问 localhost');
    }

    $literal = filter_var($host, FILTER_VALIDATE_IP) !== false;
    if ($literal) {
        // gethostbyname() returns its input unchanged for an IP literal, which
        // would be mistaken for a failed lookup, so literals skip DNS entirely.
        $addresses = [$host];
    } else {
        $addresses = gethostbynamel($host);
        if ($addresses === false || $addresses === []) {
            throw new RuntimeException('DNS 解析失败');
        }
    }

    foreach ($addresses as $address) {
        if (is_private_ip($address)) {
            throw new RuntimeException('禁止访问内网/保留地址');
        }
    }

    return [
        'host' => $host,
        'port' => (int)($parts['port'] ?? ($scheme === 'https' ? 443 : 80)),
        'ip' => $addresses[0],
        'literal' => $literal,
    ];
}

/**
 * Throw unless the URL is an http/https URL whose host resolves only to public IPv4 addresses.
 *
 * @param string $url Absolute URL.
 * @throws RuntimeException see resolve_safe_target().
 */
function validate_url_safe(string $url): void
{
    resolve_safe_target($url);
}

/**
 * Perform exactly one GET request (redirects are NOT followed).
 *
 * The target is validated with resolve_safe_target() and the connection is
 * pinned to the validated address, so cURL cannot re-resolve the host to a
 * different (possibly internal) address.
 *
 * @param string $url        Absolute URL.
 * @param int    $timeoutSec Connect/total timeout in seconds; 0 means no timeout.
 * @param int    $maxBytes   Maximum number of body bytes to keep.
 * @param string $userAgent  User-Agent header value.
 * @return array{ok:bool, status:int, final_url:string, content_type:string,
 *               headers:array<string,string>, body:string, truncated:bool, redirect_url:string}
 * @throws RuntimeException on unsafe targets or cURL failures.
 */
function http_fetch_once(string $url, int $timeoutSec, int $maxBytes, string $userAgent): array
{
    $target = resolve_safe_target($url);

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException('curl_init failed');
    }

    $headers = [];
    $body = '';
    $truncated = false;

    $options = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => $timeoutSec,
        CURLOPT_TIMEOUT => $timeoutSec,
        // Redirects are handled by http_get() so every hop can be validated.
        CURLOPT_FOLLOWLOCATION => false,
        CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        // Validation only covers A records, so never connect over IPv6.
        CURLOPT_IPRESOLVE => CURL_IPRESOLVE_V4,
        CURLOPT_USERAGENT => $userAgent,
        CURLOPT_HEADERFUNCTION => static function ($ch, string $headerLine) use (&$headers): int {
            $length = strlen($headerLine);
            $line = trim($headerLine);
            // Status lines and the blank separator carry no "name: value" pair.
            if ($line === '' || !str_contains($line, ':')) {
                return $length;
            }
            [$name, $value] = explode(':', $line, 2);
            $name = strtolower(trim($name));
            // First occurrence wins for repeated header names.
            if (!isset($headers[$name])) {
                $headers[$name] = trim($value);
            }
            return $length;
        },
        // Must come after CURLOPT_RETURNTRANSFER: the option applied last
        // decides where the body goes.
        CURLOPT_WRITEFUNCTION => static function ($ch, string $chunk) use (&$body, &$truncated, $maxBytes): int {
            $remaining = $maxBytes - strlen($body);
            if (strlen($chunk) <= $remaining) {
                $body .= $chunk;
                return strlen($chunk);
            }
            if ($remaining > 0) {
                $body .= substr($chunk, 0, $remaining);
            }
            // Returning a count different from the chunk length makes cURL
            // abort with CURLE_WRITE_ERROR; the flag lets the caller tell this
            // deliberate stop apart from a real failure.
            $truncated = true;
            return 0;
        },
    ];

    if (!$target['literal']) {
        // Pin the host to the address that passed validation (anti DNS rebinding).
        $options[CURLOPT_RESOLVE] = [
            sprintf('%s:%d:%s', $target['host'], $target['port'], $target['ip']),
        ];
    }

    curl_setopt_array($ch, $options);
    curl_exec($ch);

    $errno = curl_errno($ch);
    $error = curl_error($ch);
    $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $finalUrl = (string)curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    $contentType = (string)curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    $redirectUrl = (string)curl_getinfo($ch, CURLINFO_REDIRECT_URL);
    // No curl_close(): it has been a no-op since PHP 8.0; the handle is
    // released when $ch goes out of scope.

    $stoppedOnPurpose = $truncated && $errno === CURLE_WRITE_ERROR;
    if ($errno !== 0 && !$stoppedOnPurpose) {
        throw new RuntimeException('cURL error: ' . $error);
    }

    return [
        'ok' => $status >= 200 && $status < 300,
        'status' => $status,
        'final_url' => $finalUrl,
        'content_type' => $contentType,
        'headers' => $headers,
        'body' => $body,
        'truncated' => $truncated,
        'redirect_url' => $redirectUrl,
    ];
}

/**
 * GET a URL, following at most 5 redirects, each of which is re-validated
 * against the private/reserved address rules before it is requested.
 *
 * @param string $url        Absolute URL.
 * @param int    $timeoutSec Overall time budget in seconds shared by all hops; values <= 0 disable the limit.
 * @param int    $maxBytes   Maximum number of body bytes to keep; larger responses are cut, not rejected.
 * @param string $userAgent  User-Agent header value.
 * @return array{ok:bool, status:int, final_url:string, content_type:string,
 *               headers:array<string,string>, body:string, truncated:bool}
 *         `ok` is true for 2xx responses; headers/body belong to the final response.
 * @throws RuntimeException on unsafe targets, cURL failures, timeout or too many redirects.
 */
function http_get(string $url, int $timeoutSec, int $maxBytes, string $userAgent): array
{
    $maxRedirects = 5;
    $deadline = $timeoutSec > 0 ? microtime(true) + $timeoutSec : null;
    $currentUrl = $url;

    for ($hop = 0; $hop <= $maxRedirects; $hop++) {
        $budget = 0; // 0 = no timeout for cURL
        if ($deadline !== null) {
            $budget = (int)ceil($deadline - microtime(true));
            if ($budget <= 0) {
                throw new RuntimeException('cURL error: 请求超时');
            }
        }

        $response = http_fetch_once($currentUrl, $budget, $maxBytes, $userAgent);
        $nextUrl = $response['redirect_url'];
        unset($response['redirect_url']);

        $isRedirect = $response['status'] >= 300 && $response['status'] < 400 && $nextUrl !== '';
        if (!$isRedirect) {
            return $response;
        }

        // The next iteration validates $nextUrl before any connection is made.
        $currentUrl = $nextUrl;
    }

    throw new RuntimeException('cURL error: 重定向次数过多(最多 ' . $maxRedirects . ' 次)');
}

/**
 * Extract title, meta description and readable text from an HTML document.
 *
 * The input is assumed to be UTF-8; other encodings may come out garbled
 * (accepted trade-off of this simplified implementation).
 *
 * @param string $html     Raw HTML.
 * @param int    $maxChars Maximum number of characters of body text to return.
 * @return array{title:string, description:string, text:string}
 */
function html_to_text_simple(string $html, int $maxChars): array
{
    // DOMDocument::loadHTML() throws ValueError on an empty string.
    if (trim($html) === '') {
        return ['title' => '', 'description' => '', 'text' => ''];
    }

    // Turn every non-ASCII character into a numeric entity so libxml does not
    // misinterpret the encoding (replacement for the deprecated
    // mb_convert_encoding(..., 'HTML-ENTITIES', ...)).
    $encoded = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

    $previousErrorMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $loaded = $encoded !== '' && $dom->loadHTML($encoded);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    if (!$loaded) {
        return ['title' => '', 'description' => '', 'text' => mb_substr(strip_tags($html), 0, $maxChars)];
    }

    // Title: first <title> element.
    $title = '';
    $titleNodes = $dom->getElementsByTagName('title');
    if ($titleNodes->length > 0) {
        $title = trim((string)$titleNodes->item(0)->textContent);
    }

    // Description: first <meta name="description">.
    $description = '';
    foreach ($dom->getElementsByTagName('meta') as $meta) {
        if (strtolower((string)$meta->getAttribute('name')) === 'description') {
            $description = trim((string)$meta->getAttribute('content'));
            break;
        }
    }

    // Drop nodes that are obviously not main content.
    $noiseSelectors = [
        '//script', '//style', '//noscript', '//svg', '//canvas',
        '//header', '//footer', '//nav',
        '//*[contains(@class,"nav")]', '//*[contains(@class,"footer")]',
        '//*[contains(@class,"header")]', '//*[contains(@class,"sidebar")]',
        '//*[contains(@id,"nav")]', '//*[contains(@id,"footer")]',
        '//*[contains(@id,"header")]', '//*[contains(@id,"sidebar")]',
    ];
    $xpath = new DOMXPath($dom);
    $noiseNodes = $xpath->query(implode(' | ', $noiseSelectors));
    if ($noiseNodes !== false) {
        // The XPath result list is a snapshot, so removing while iterating is safe.
        foreach ($noiseNodes as $node) {
            $node->parentNode?->removeChild($node);
        }
    }

    // Body text, falling back to the whole document when there is no <body>.
    $bodyNodes = $dom->getElementsByTagName('body');
    $text = $bodyNodes->length > 0
        ? (string)$bodyNodes->item(0)->textContent
        : (string)$dom->textContent;

    // Whitespace clean-up. The `u` modifier matters: without it \R treats the
    // byte 0x85 as a newline and can cut a multi-byte UTF-8 character in half.
    // On a PCRE failure the previous value is kept rather than blanked.
    $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
    // Normalise line breaks and drop the single space left next to them, so
    // that "blank" lines made of indentation really are empty ...
    $text = preg_replace('/ ?\R ?/u', "\n", $text) ?? $text;
    // ... and runs of blank lines collapse to one.
    $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;
    $text = trim($text);

    if (mb_strlen($text) > $maxChars) {
        $text = mb_substr($text, 0, $maxChars) . "\n\n...[truncated]";
    }
    if ($description !== '' && mb_strlen($description) > 500) {
        $description = mb_substr($description, 0, 500) . '...[truncated]';
    }

    return ['title' => $title, 'description' => $description, 'text' => $text];
}

return [
    'name' => 'fetch_url_text',
    'description' => '访问给定 URL 并提取可读文本。mode=text 返回正文;mode=full 返回更多网页信息(headers、html片段等)。',
    'parameters' => [
        'type' => 'object',
        'properties' => [
            'url' => [
                'type' => 'string',
                'description' => '要访问的网页 URL(http/https)',
            ],
            'mode' => [
                'type' => 'string',
                'enum' => ['text', 'full'],
                'description' => 'text=只返回正文(默认);full=返回更完整信息(含部分原始HTML)',
            ],
            'timeout_sec' => [
                'type' => 'integer',
                'description' => '超时秒数(默认 12)',
            ],
            'max_bytes' => [
                'type' => 'integer',
                'description' => '最多下载字节数(默认 1200000,大约 1.2MB)',
            ],
            'max_chars' => [
                'type' => 'integer',
                'description' => '最多返回文本字符数(默认 12000)',
            ],
        ],
        'required' => ['url'],
    ],
    /**
     * Fetch the URL and return extracted text (plus extra details in full mode).
     *
     * @param array $args    Tool arguments: url (required), mode, timeout_sec, max_bytes, max_chars.
     * @param array $context Unused here.
     * @return array Result with ok/status/final_url/content_type and, for 2xx
     *               responses, title/description/text.
     * @throws RuntimeException on an empty/unsafe URL or a transport failure.
     */
    'run' => function (array $args, array $context): array {
        $url = trim((string)($args['url'] ?? ''));
        if ($url === '') {
            throw new RuntimeException('url 不能为空');
        }

        $mode = (string)($args['mode'] ?? 'text');
        if ($mode !== 'text' && $mode !== 'full') {
            $mode = 'text';
        }

        // Arguments come from the model, so clamp them on both sides: the
        // lower bounds are the original ones, the upper bounds keep a single
        // call from holding the process for minutes or exhausting memory.
        $timeoutSec = (int)($args['timeout_sec'] ?? 12);
        if ($timeoutSec <= 0) {
            $timeoutSec = 12;
        }
        $timeoutSec = min($timeoutSec, 60);

        $maxBytes = min(max((int)($args['max_bytes'] ?? 1200000), 20000), 10_000_000);
        $maxChars = min(max((int)($args['max_chars'] ?? 12000), 1000), 200_000);

        // Fail fast on an unsafe URL; http_get() re-validates every hop,
        // including redirect targets.
        validate_url_safe($url);

        $userAgent = 'Mozilla/5.0 (compatible; fetch_url_text/1.0)';
        $resp = http_get($url, $timeoutSec, $maxBytes, $userAgent);

        $contentType = strtolower($resp['content_type'] ?? '');
        $body = (string)($resp['body'] ?? '');

        // Non-2xx: return immediately (full mode adds more details).
        if (!$resp['ok']) {
            $out = [
                'ok' => false,
                'status' => $resp['status'],
                'final_url' => $resp['final_url'],
                'content_type' => $resp['content_type'],
            ];
            if ($mode === 'full') {
                $out['headers'] = $resp['headers'];
                $out['body_snippet'] = mb_substr($body, 0, 2000);
            }
            return $out;
        }

        // Real extraction only for HTML; any other content type gets a
        // strip_tags fallback.
        if (str_contains($contentType, 'text/html') || str_contains($contentType, 'application/xhtml')) {
            $extracted = html_to_text_simple($body, $maxChars);
        } else {
            // Explicit ASCII whitespace class: byte-safe for UTF-8 regardless
            // of locale, and it cannot fail on invalid UTF-8 the way /u would.
            $stripped = strip_tags($body);
            $plain = trim(preg_replace('/[ \t\r\n\x0B\f]+/', ' ', $stripped) ?? $stripped);
            if (mb_strlen($plain) > $maxChars) {
                $plain = mb_substr($plain, 0, $maxChars) . '...[truncated]';
            }
            $extracted = ['title' => '', 'description' => '', 'text' => $plain];
        }

        $out = [
            'ok' => true,
            'status' => $resp['status'],
            'final_url' => $resp['final_url'],
            'content_type' => $resp['content_type'],
            'title' => $extracted['title'],
            'description' => $extracted['description'],
            'text' => $extracted['text'],
        ];

        if ($mode === 'full') {
            $out['headers'] = $resp['headers'];
            // Keep the payload bounded: only a leading HTML snippet.
            $out['html_snippet'] = mb_substr($body, 0, 20000);
            $out['downloaded_bytes'] = strlen($body);
            $out['truncated'] = (bool)($resp['truncated'] ?? false);
        }

        return $out;
    },
];
保存