AI 项目在线编辑器

<?php
declare(strict_types=1);

/**
 * 通用网页抓取 + 文字提取工具（简化版）
 * - mode=text: 返回提取后的正文（默认）
 * - mode=full: 附带更多信息（状态码、headers、html片段等）
 */

/**
 * Tells whether an address must be treated as non-public and therefore blocked.
 *
 * Only dotted-quad IPv4 is understood. Anything else (IPv6, malformed input)
 * is reported as private so that callers fail closed.
 *
 * @param string $ip Address to classify.
 * @return bool True when $ip is not a valid IPv4 address, or falls inside one
 *              of the special-purpose IPv4 blocks listed below.
 */
function is_private_ip(string $ip): bool
{
  if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) === false) return true;

  $long = ip2long($ip);
  if ($long === false) return true;

  // Inclusive [first, last] pairs of IPv4 blocks that are not publicly routable.
  $ranges = [
    ['0.0.0.0', '0.255.255.255'],         // "this network"
    ['10.0.0.0', '10.255.255.255'],       // private
    ['100.64.0.0', '100.127.255.255'],    // carrier-grade NAT shared space
    ['127.0.0.0', '127.255.255.255'],     // loopback
    ['169.254.0.0', '169.254.255.255'],   // link-local (includes cloud metadata endpoints)
    ['172.16.0.0', '172.31.255.255'],     // private
    ['192.0.0.0', '192.0.0.255'],         // IETF protocol assignments
    ['192.0.2.0', '192.0.2.255'],         // documentation (TEST-NET-1)
    ['192.168.0.0', '192.168.255.255'],   // private
    ['198.18.0.0', '198.19.255.255'],     // benchmarking
    ['198.51.100.0', '198.51.100.255'],   // documentation (TEST-NET-2)
    ['203.0.113.0', '203.0.113.255'],     // documentation (TEST-NET-3)
    ['224.0.0.0', '239.255.255.255'],     // multicast
    ['240.0.0.0', '255.255.255.255'],     // reserved + limited broadcast
  ];

  foreach ($ranges as [$first, $last]) {
    $lo = ip2long($first);
    $hi = ip2long($last);
    if ($lo !== false && $hi !== false && $long >= $lo && $long <= $hi) return true;
  }
  return false;
}

/**
 * Basic SSRF guard: rejects URLs that are malformed, not http/https, or whose
 * host is localhost or maps to a non-public address.
 *
 * The host is lower-cased and stripped of a trailing root dot before checks.
 * An IP-literal host is classified directly (no DNS lookup). A hostname is
 * resolved with gethostbynamel() and every returned IPv4 address must pass
 * is_private_ip().
 *
 * @param string $url Absolute URL to check.
 * @throws RuntimeException When the URL is malformed, uses another scheme,
 *                          names localhost, cannot be resolved, or maps to a
 *                          blocked address.
 */
function validate_url_safe(string $url): void
{
  $parts = parse_url($url);
  if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
    throw new RuntimeException('URL 不合法');
  }
  if (!in_array(strtolower((string)$parts['scheme']), ['http', 'https'], true)) {
    throw new RuntimeException('仅允许 http/https');
  }

  // Host names are case-insensitive and may end with the DNS root dot
  // ("LOCALHOST", "localhost."), so normalise before comparing.
  $host = rtrim(strtolower((string)$parts['host']), '.');
  if ($host === '') {
    throw new RuntimeException('URL 不合法');
  }
  if ($host === 'localhost' || str_ends_with($host, '.localhost')) {
    throw new RuntimeException('禁止访问 localhost');
  }

  // parse_url() keeps the brackets around IPv6 literals.
  $literal = trim($host, '[]');
  if (filter_var($literal, FILTER_VALIDATE_IP) !== false) {
    // gethostbyname() returns an IP literal unchanged, which is
    // indistinguishable from a failed lookup, so classify literals directly.
    $addresses = [$literal];
  } else {
    $addresses = gethostbynamel($host);
    if ($addresses === false || $addresses === []) {
      throw new RuntimeException('DNS 解析失败');
    }
  }

  // Check every address: a name may resolve to a mix of public and internal IPs.
  foreach ($addresses as $address) {
    if (is_private_ip($address)) {
      throw new RuntimeException('禁止访问内网/保留地址');
    }
  }
}

/**
 * Performs an HTTP GET with a size cap and returns status, headers and body.
 *
 * Redirects (at most 5) are followed manually so that every hop can be passed
 * through validate_url_safe() before it is requested. The body is cut off
 * after $maxBytes bytes; a cut-off response is returned normally with
 * 'truncated' => true rather than being reported as a transfer error.
 *
 * @param string $url        Absolute http/https URL to fetch.
 * @param int    $timeoutSec Connect timeout, and overall time budget across
 *                           all redirect hops (<= 0 means no overall budget).
 * @param int    $maxBytes   Maximum number of body bytes to keep.
 * @param string $userAgent  Value of the User-Agent request header.
 * @return array{ok: bool, status: int, final_url: string, content_type: string,
 *               headers: array<string, string>, body: string, truncated: bool}
 *               'ok' is true only for 2xx. Header names are lower-cased and the
 *               first value wins when a header repeats.
 * @throws RuntimeException On cURL failure, too many redirects, or a redirect
 *                          target rejected by validate_url_safe().
 */
function http_get(string $url, int $timeoutSec, int $maxBytes, string $userAgent): array
{
  $maxRedirects = 5;
  $headers = [];
  $body = '';
  $truncated = false;

  $ch = curl_init();
  if ($ch === false) throw new RuntimeException('curl_init failed');

  curl_setopt_array($ch, [
    CURLOPT_CONNECTTIMEOUT => $timeoutSec,
    CURLOPT_TIMEOUT => $timeoutSec,
    // Redirects are handled in the loop below so each target can be validated.
    CURLOPT_FOLLOWLOCATION => false,
    CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
    CURLOPT_USERAGENT => $userAgent,
    CURLOPT_HEADERFUNCTION => function ($ch, $headerLine) use (&$headers) {
      $len = strlen($headerLine);
      $line = trim($headerLine);
      if (str_starts_with($line, 'HTTP/')) {
        // A new status line starts a new response (e.g. after "100 Continue"):
        // forget headers that belonged to the previous one.
        $headers = [];
        return $len;
      }
      if ($line === '' || !str_contains($line, ':')) return $len;
      [$name, $value] = explode(':', $line, 2);
      $name = strtolower(trim($name));
      if (!isset($headers[$name])) $headers[$name] = trim($value);
      return $len;
    },
    // The write callback captures the body itself, so nothing is echoed.
    CURLOPT_WRITEFUNCTION => function ($ch, $chunk) use (&$body, &$truncated, $maxBytes) {
      $remain = $maxBytes - strlen($body);
      if (strlen($chunk) > $remain) {
        // Keep what still fits, then return a short count: libcurl aborts the
        // transfer with CURLE_WRITE_ERROR, which is recognised below.
        $truncated = true;
        $kept = $remain > 0 ? substr($chunk, 0, $remain) : '';
        $body .= $kept;
        return strlen($kept);
      }
      $body .= $chunk;
      return strlen($chunk);
    },
  ]);

  $deadline = $timeoutSec > 0 ? microtime(true) + $timeoutSec : null;
  $currentUrl = $url;
  $status = 0;

  for ($hop = 0; ; $hop++) {
    if ($deadline !== null) {
      // Share one time budget across all hops instead of restarting it per hop.
      $remainingSec = (int)ceil($deadline - microtime(true));
      if ($remainingSec <= 0) throw new RuntimeException('cURL error: Operation timed out');
      curl_setopt($ch, CURLOPT_TIMEOUT, $remainingSec);
    }

    // Each hop starts from a clean slate (the callbacks hold references).
    $headers = [];
    $body = '';
    $truncated = false;

    curl_setopt($ch, CURLOPT_URL, $currentUrl);
    curl_exec($ch);

    $errno = curl_errno($ch);
    if ($errno !== 0 && !($truncated && $errno === CURLE_WRITE_ERROR)) {
      throw new RuntimeException('cURL error: ' . curl_error($ch));
    }

    $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
    // With FOLLOWLOCATION off, cURL reports the absolute URL it would have followed.
    $nextUrl = (string)curl_getinfo($ch, CURLINFO_REDIRECT_URL);
    if ($status < 300 || $status >= 400 || $nextUrl === '') break;

    if ($hop >= $maxRedirects) {
      throw new RuntimeException('cURL error: Maximum (' . $maxRedirects . ') redirects followed');
    }
    // A public URL may redirect to an internal one; re-check every hop.
    validate_url_safe($nextUrl);
    $currentUrl = $nextUrl;
  }

  $finalUrl = (string)curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
  $contentType = (string)curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
  // No curl_close(): it has no effect since PHP 8.0; the handle is released
  // when $ch goes out of scope.

  return [
    'ok' => $status >= 200 && $status < 300,
    'status' => $status,
    'final_url' => $finalUrl,
    'content_type' => $contentType,
    'headers' => $headers,
    'body' => $body,
    'truncated' => $truncated,
  ];
}

/**
 * Extracts the title, meta description and readable text from an HTML string.
 *
 * The markup is assumed to be UTF-8; no charset detection is attempted, so
 * other encodings may come out garbled. Script/style/navigation-like nodes are
 * removed before the text is read, whitespace is normalised, and the text is
 * cut to $maxChars characters (the description to 500).
 *
 * @param string $html     Raw HTML markup.
 * @param int    $maxChars Maximum number of characters of body text to return
 *                         (a "...[truncated]" marker is appended when cut).
 * @return array{title: string, description: string, text: string}
 */
function html_to_text_simple(string $html, int $maxChars): array
{
  // DOMDocument::loadHTML() throws ValueError for an empty string on PHP 8.
  if (trim($html) === '') {
    return ['title' => '', 'description' => '', 'text' => ''];
  }

  // Silence parser warnings for sloppy markup, then restore the caller's setting.
  $previousErrorMode = libxml_use_internal_errors(true);
  $dom = new DOMDocument();
  // Turn every non-ASCII character into a numeric entity so the parser does
  // not have to guess the encoding. This replaces the 'HTML-ENTITIES'
  // conversion, which is deprecated since PHP 8.2.
  $loaded = $dom->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
  libxml_clear_errors();
  libxml_use_internal_errors($previousErrorMode);

  if (!$loaded) {
    return ['title' => '', 'description' => '', 'text' => mb_substr(strip_tags($html), 0, $maxChars)];
  }

  // Title
  $titleNode = $dom->getElementsByTagName('title')->item(0);
  $title = $titleNode !== null ? trim((string)$titleNode->textContent) : '';

  // Meta description (first <meta name="description">)
  $desc = '';
  foreach ($dom->getElementsByTagName('meta') as $meta) {
    if (strtolower((string)$meta->getAttribute('name')) === 'description') {
      $desc = trim((string)$meta->getAttribute('content'));
      break;
    }
  }

  // Drop nodes that are clearly not main content.
  $xpath = new DOMXPath($dom);
  $removeQueries = [
    '//script', '//style', '//noscript', '//svg', '//canvas',
    '//header', '//footer', '//nav', '//*[contains(@class,"nav")]',
    '//*[contains(@class,"footer")]', '//*[contains(@class,"header")]',
    '//*[contains(@class,"sidebar")]', '//*[contains(@id,"nav")]',
    '//*[contains(@id,"footer")]', '//*[contains(@id,"header")]',
    '//*[contains(@id,"sidebar")]',
  ];
  foreach ($removeQueries as $query) {
    foreach ($xpath->query($query) ?: [] as $node) {
      // The class/id tests are substring matches, so <body class="has-sidebar">
      // would match too. Never remove the document root or the body itself.
      $tag = strtolower($node->nodeName);
      if ($tag === 'html' || $tag === 'body') continue;
      $node->parentNode?->removeChild($node);
    }
  }

  // Main text: prefer <body>, otherwise the whole document.
  $bodyNode = $dom->getElementsByTagName('body')->item(0);
  $text = (string)($bodyNode !== null ? $bodyNode->textContent : $dom->textContent);

  // Whitespace clean-up. The /u flag matters: without it \R treats the byte
  // 0x85 as a newline and can cut a multi-byte UTF-8 character in half.
  // On a regex failure the previous value is kept instead of losing the text.
  $text = preg_replace('/\h+/u', ' ', $text) ?? $text;         // runs of spaces/tabs -> one space
  $text = preg_replace('/ ?\R ?/u', "\n", $text) ?? $text;      // unify newlines, trim around them
  $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;     // at most one blank line in a row
  $text = trim($text);

  if (mb_strlen($text) > $maxChars) {
    $text = mb_substr($text, 0, $maxChars) . "\n\n...[truncated]";
  }

  if ($desc !== '' && mb_strlen($desc) > 500) {
    $desc = mb_substr($desc, 0, 500) . '...[truncated]';
  }

  return ['title' => $title, 'description' => $desc, 'text' => $text];
}

// Tool definition: name, description, JSON-schema-style parameters, and the
// `run` callback that performs the fetch and text extraction.
return [
  'name' => 'fetch_url_text',
  'description' => '访问给定 URL 并提取可读文本。mode=text 返回正文；mode=full 返回更多网页信息（headers、html片段等）。',
  'parameters' => [
    'type' => 'object',
    'properties' => [
      'url' => [
        'type' => 'string',
        'description' => '要访问的网页 URL（http/https）',
      ],
      'mode' => [
        'type' => 'string',
        'enum' => ['text', 'full'],
        'description' => 'text=只返回正文（默认）；full=返回更完整信息（含部分原始HTML）',
      ],
      'timeout_sec' => [
        'type' => 'integer',
        'description' => '超时秒数（默认 12）',
      ],
      'max_bytes' => [
        'type' => 'integer',
        'description' => '最多下载字节数（默认 1200000，大约 1.2MB）',
      ],
      'max_chars' => [
        'type' => 'integer',
        'description' => '最多返回文本字符数（默认 12000）',
      ],
    ],
    'required' => ['url'],
  ],
  /**
   * Fetches $args['url'] and returns the extracted text.
   *
   * Out-of-range numeric arguments are clamped rather than rejected, and an
   * unknown mode falls back to 'text'.
   *
   * @param array $args    Tool arguments: url (required), mode, timeout_sec,
   *                       max_bytes, max_chars.
   * @param array $context Not used by this tool.
   * @return array Always ok/status/final_url/content_type. On 2xx also
   *               title/description/text. In mode=full also headers plus a
   *               body or html snippet.
   * @throws RuntimeException When url is missing, fails the safety check, or
   *                          the request itself fails.
   */
  'run' => function(array $args, array $context) {
    // Accept only a real string; casting an array would give "Array" plus a warning.
    $url = is_string($args['url'] ?? null) ? trim($args['url']) : '';
    if ($url === '') throw new RuntimeException('url 不能为空');

    $mode = ($args['mode'] ?? 'text') === 'full' ? 'full' : 'text';

    // Arguments come from the tool caller. Besides the lower bounds, cap them
    // so one call cannot hold a worker for hours or buffer an unbounded
    // amount of data in memory.
    $timeoutSec = (int)($args['timeout_sec'] ?? 12);
    if ($timeoutSec <= 0) $timeoutSec = 12;
    $timeoutSec = min($timeoutSec, 120);

    $maxBytes = (int)($args['max_bytes'] ?? 1200000);
    $maxBytes = min(max($maxBytes, 20000), 20000000);

    $maxChars = (int)($args['max_chars'] ?? 12000);
    $maxChars = min(max($maxChars, 1000), 1000000);

    // Basic safety check (keep this: it blocks internal/reserved targets).
    validate_url_safe($url);

    $ua = 'Mozilla/5.0 (compatible; fetch_url_text/1.0)';
    $resp = http_get($url, $timeoutSec, $maxBytes, $ua);

    $ct = strtolower($resp['content_type'] ?? '');
    $body = (string)($resp['body'] ?? '');

    // Non-2xx: return right away (mode=full adds headers and a body snippet).
    if (!$resp['ok']) {
      $out = [
        'ok' => false,
        'status' => $resp['status'],
        'final_url' => $resp['final_url'],
        'content_type' => $resp['content_type'],
      ];
      if ($mode === 'full') {
        $out['headers'] = $resp['headers'];
        $out['body_snippet'] = mb_substr($body, 0, 2000);
      }
      return $out;
    }

    // Only HTML gets structured extraction; any other content type falls back
    // to strip_tags() with whitespace collapsed.
    if (str_contains($ct, 'text/html') || str_contains($ct, 'application/xhtml')) {
      $extracted = html_to_text_simple($body, $maxChars);
    } else {
      $plain = trim(preg_replace('/\s+/', ' ', strip_tags($body)) ?? '');
      if (mb_strlen($plain) > $maxChars) $plain = mb_substr($plain, 0, $maxChars) . '...[truncated]';
      $extracted = ['title' => '', 'description' => '', 'text' => $plain];
    }

    $out = [
      'ok' => true,
      'status' => $resp['status'],
      'final_url' => $resp['final_url'],
      'content_type' => $resp['content_type'],
      'title' => $extracted['title'],
      'description' => $extracted['description'],
      'text' => $extracted['text'],
    ];

    if ($mode === 'full') {
      $out['headers'] = $resp['headers'];
      // Keep the payload small: return only a leading slice of the HTML.
      $out['html_snippet'] = mb_substr($body, 0, 20000);
      $out['downloaded_bytes'] = strlen($body);
    }

    return $out;
  },
];

Models

Tools