---
name: web-scraper
description: 从网页提取和处理数据，使用CSS选择器、XPath智能解析，支持限速和错误处理。
metadata:
  short-description: 从网页提取数据
---

Web Scraper Skill

描述

从网页高效、智能地提取和处理数据。

触发条件

/scrape 命令
用户请求网页数据提取
用户需要解析HTML

提示

你是一个网页抓取专家，能够高效且符合道德规范地提取数据。

Puppeteer 抓取器 (TypeScript)

import puppeteer from 'puppeteer';

/** One product record, as returned by `scrapeProducts`. */
interface Product {
  /** Display name of the product. */
  name: string;
  /** Price as a number. */
  price: number;
  /** Rating as a number. */
  rating: number;
  /** Link associated with the product. */
  url: string;
}

/**
 * Scrapes every `.product-card` element on a page into a list of products.
 *
 * Launches a headless browser, loads `url`, waits for a `.product-card` to
 * appear, then reads the name, price, rating and link out of each card.
 * The browser is always closed, even when navigation or extraction fails,
 * so a failed scrape does not leave a browser process behind.
 *
 * @param url - Address of the page to load.
 * @returns One `Product` per `.product-card` found. Missing name/link
 *   elements yield `''`; missing price/rating elements yield `0`.
 * @throws Any error raised by Puppeteer while launching, navigating, or
 *   waiting for the selector.
 */
async function scrapeProducts(url: string): Promise<Product[]> {
  const browser = await puppeteer.launch({ headless: 'new' });

  try {
    const page = await browser.newPage();

    // Set a browser-like user agent to reduce the chance of being detected.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

    await page.goto(url, { waitUntil: 'networkidle2' });

    // Wait for the product cards to load.
    await page.waitForSelector('.product-card');

    // This callback runs inside the page, so it must be self-contained and
    // cannot reference helpers defined outside of it.
    return await page.evaluate(() => {
      const items = document.querySelectorAll('.product-card');
      return Array.from(items).map(item => ({
        name: item.querySelector('.product-name')?.textContent?.trim() ?? '',
        // Strip every `$` and thousands separator before parsing, so that
        // "$1,299.00" becomes 1299 rather than 1.
        price: parseFloat(
          item.querySelector('.price')?.textContent?.replace(/[$,]/g, '') ?? '0',
        ),
        rating: parseFloat(item.querySelector('.rating')?.getAttribute('data-rating') ?? '0'),
        url: item.querySelector('a')?.href ?? '',
      }));
    });
  } finally {
    // Release the browser on both the success and the failure path.
    await browser.close();
  }
}

Cheerio 解析器 (Node.js)

import axios from 'axios';
import * as cheerio from 'cheerio';

/** Fields extracted from an article page by `parseArticle`. */
interface Article {
  title: string;
  author: string;
  /** Value of the `datetime` attribute on `<time>`, if present. */
  date: string | undefined;
  /** Paragraph texts joined by a blank line. */
  content: string;
  tags: string[];
}

/**
 * Downloads an article page and extracts its main fields with Cheerio.
 *
 * @param url - Address of the article page to fetch.
 * @returns The title, author, date, body text and tags found on the page.
 *   Selectors that match nothing yield empty strings / an empty tag list,
 *   and `date` is `undefined` when no `datetime` attribute is found.
 * @throws Any error raised by `axios.get` for the request.
 */
async function parseArticle(url: string): Promise<Article> {
  const { data } = await axios.get(url, {
    headers: { 'User-Agent': 'Mozilla/5.0' }
  });

  const $ = cheerio.load(data);

  // Collect the text of each body paragraph separately so that paragraphs
  // can be joined with an explicit separator.
  const paragraphs = $('article.content p').map((_, el) => $(el).text()).get();

  return {
    title: $('h1.article-title').text().trim(),
    author: $('span.author-name').text().trim(),
    date: $('time').attr('datetime'),
    // Separate paragraphs with a blank line. The escape sequence must stay
    // on one line: a raw line break inside a quoted string is a syntax error.
    content: paragraphs.join('\n\n'),
    tags: $('a.tag').map((_, el) => $(el).text()).get(),
  };
}

速率限制

/**
 * Serial task queue that spaces out asynchronous work.
 *
 * Tasks passed to `add` run one at a time, in the order they were added.
 * After each task settles, the limiter waits `delayMs` milliseconds before
 * starting the next one (the wait also follows the last queued task).
 */
class RateLimiter {
  private pending: Array<() => Promise<void>> = [];
  private draining = false;

  /**
   * @param delayMs - Pause, in milliseconds, inserted after every task.
   */
  constructor(private delayMs: number = 1000) {}

  /**
   * Queues `fn` and returns a promise that settles with its outcome.
   *
   * @param fn - Task to run once all previously queued tasks have finished.
   * @returns A promise resolving to the value of `fn`, or rejecting with
   *   the error it threw.
   */
  async add<T>(fn: () => Promise<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      // The wrapper never rejects: it forwards the outcome of `fn` to the
      // caller's promise so the drain loop keeps running after a failure.
      const job = async (): Promise<void> => {
        let outcome: T;
        try {
          outcome = await fn();
        } catch (err) {
          reject(err);
          return;
        }
        resolve(outcome);
      };

      this.pending.push(job);
      void this.process();
    });
  }

  /** Runs queued jobs until the queue is empty; a no-op if already running. */
  private async process(): Promise<void> {
    if (!this.draining) {
      this.draining = true;

      for (
        let job = this.pending.shift();
        job !== undefined;
        job = this.pending.shift()
      ) {
        await job();
        await this.pause();
      }

      this.draining = false;
    }
  }

  /** Resolves after `delayMs` milliseconds. */
  private pause(): Promise<void> {
    return new Promise<void>(resolve => {
      setTimeout(resolve, this.delayMs);
    });
  }
}

// Usage example: route every scrape through one shared limiter so that
// requests run one at a time with a 2-second pause after each.
const limiter = new RateLimiter(2000);
const scrapeJobs = urls.map(url => limiter.add(() => scrapeProducts(url)));
const results = await Promise.all(scrapeJobs);

兼容性

Codex: ✅
Claude Code: ✅

网页抓取器Skill web-scraper

name: web-scraper description: 从网页提取和处理数据，使用CSS选择器、XPath智能解析，支持限速和错误处理。 metadata: short-description: 从网页提取数据

Web Scraper Skill

描述

触发条件

提示

Puppeteer 抓取器 (TypeScript)

Cheerio 解析器 (Node.js)

速率限制

标签

兼容性