---
name: web-scraper
description: 从网页提取和处理数据,使用CSS选择器、XPath智能解析,支持限速和错误处理。
metadata:
  short-description: 从网页提取数据
---
Web Scraper Skill
描述
从网页高效、智能地提取和处理数据。
触发条件
- /scrape 命令
- 用户请求网页数据提取
- 用户需要解析HTML
提示
你是一个网页抓取专家,能够高效且符合道德规范地提取数据。
Puppeteer 抓取器 (TypeScript)
import puppeteer from 'puppeteer';
/**
 * One product record as produced by the product scraper.
 */
interface Product {
  /** Trimmed text of the card's `.product-name` element; empty when absent. */
  name: string;

  /** Numeric price parsed from the card's `.price` text. */
  price: number;

  /** Numeric rating parsed from the `data-rating` attribute of `.rating`. */
  rating: number;

  /** `href` of the first anchor inside the card; empty when absent. */
  url: string;
}
/**
 * Loads `url` in a headless browser and converts every `.product-card`
 * element on the page into a `Product`.
 *
 * The browser is always closed, including when navigation, the selector wait,
 * or the in-page extraction throws.
 *
 * @param url - Address of the listing page to scrape.
 * @returns One `Product` per `.product-card` found on the page.
 * @throws Whatever Puppeteer raises for launch, navigation, or selector
 *   timeouts; the error is propagated after the browser has been closed.
 */
async function scrapeProducts(url: string): Promise<Product[]> {
  // NOTE(review): `headless: 'new'` is kept from the original; confirm it is
  // still accepted by the Puppeteer version this project pins.
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();

    // Send a desktop-browser user agent instead of the default headless one.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
    await page.goto(url, { waitUntil: 'networkidle2' });

    // Wait until at least one product card has been rendered.
    await page.waitForSelector('.product-card');

    return await page.evaluate(() => {
      // This callback runs inside the page, so it cannot reference anything
      // from the surrounding Node.js scope; helpers must be declared here.
      const parsePrice = (raw: string | null | undefined): number =>
        // Strip every dollar sign, thousands separator and whitespace so that
        // "$1,299.00" parses as 1299 rather than stopping at the comma.
        parseFloat((raw ?? '0').replace(/[$,\s]/g, ''));

      const cards = Array.from(document.querySelectorAll('.product-card'));
      return cards.map(card => ({
        name: card.querySelector('.product-name')?.textContent?.trim() ?? '',
        price: parsePrice(card.querySelector('.price')?.textContent),
        rating: parseFloat(card.querySelector('.rating')?.getAttribute('data-rating') ?? '0'),
        url: card.querySelector('a')?.href ?? '',
      }));
    });
  } finally {
    // Release the browser process on both the success and the failure path.
    await browser.close();
  }
}
Cheerio 解析器 (Node.js)
import axios from 'axios';
import * as cheerio from 'cheerio';
/**
 * Downloads the page at `url` and extracts article fields with Cheerio.
 *
 * @param url - Address of the article page.
 * @returns An object with:
 *   - `title`: trimmed text of `h1.article-title`
 *   - `author`: trimmed text of `span.author-name`
 *   - `date`: the `datetime` attribute of the first `time` element, or
 *     `undefined` when there is none
 *   - `content`: text of every `article.content p`, joined with newlines
 *   - `tags`: text of every `a.tag`
 * @throws Whatever `axios.get` raises for network or HTTP failures.
 */
async function parseArticle(url: string) {
  const { data } = await axios.get(url, {
    headers: { 'User-Agent': 'Mozilla/5.0' },
  });
  const $ = cheerio.load(data);

  const paragraphs = $('article.content p')
    .map((_, el) => $(el).text())
    .get();

  return {
    title: $('h1.article-title').text().trim(),
    author: $('span.author-name').text().trim(),
    date: $('time').attr('datetime'),
    // The separator is written as an escape sequence: a raw line break inside
    // a quoted string literal is a syntax error.
    content: paragraphs.join('\n'),
    tags: $('a.tag').map((_, el) => $(el).text()).get(),
  };
}
速率限制
/**
 * Runs asynchronous tasks strictly one at a time, pausing `delayMs`
 * milliseconds after each task before the next one starts.
 */
class RateLimiter {
  /** Jobs waiting to run, in submission order. */
  private pending: Array<() => Promise<void>> = [];

  /** True while the drain loop is active, so only one loop ever runs. */
  private draining = false;

  /**
   * @param delayMs - Pause inserted after each task, in milliseconds.
   */
  constructor(private delayMs: number = 1000) {}

  /**
   * Queues `fn` and resolves or rejects with its outcome once it has run.
   *
   * @param fn - Task to execute when its turn comes.
   * @returns A promise that settles with the result of `fn`.
   */
  async add<T>(fn: () => Promise<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      // Wrap the task so the drain loop never sees a rejection; the outcome
      // is forwarded to the caller's promise instead.
      const job = async (): Promise<void> => {
        try {
          const value = await fn();
          resolve(value);
        } catch (err) {
          reject(err);
        }
      };
      this.pending.push(job);
      void this.drain();
    });
  }

  /** Processes queued jobs sequentially until the queue is empty. */
  private async drain(): Promise<void> {
    if (this.draining) {
      return;
    }
    this.draining = true;
    for (let job = this.pending.shift(); job !== undefined; job = this.pending.shift()) {
      await job();
      await this.pause();
    }
    this.draining = false;
  }

  /** Returns a promise that resolves after `delayMs` milliseconds. */
  private pause(): Promise<void> {
    return new Promise(done => setTimeout(done, this.delayMs));
  }
}
// Usage example: scrape every URL through a shared limiter.
// The limiter waits 2 seconds after each request before starting the next.
const limiter = new RateLimiter(2000);
const scrapeJobs = urls.map(url => limiter.add(() => scrapeProducts(url)));
const results = await Promise.all(scrapeJobs);
标签
网页抓取, 数据提取, 解析, 自动化, html
兼容性
- Codex: ✅
- Claude Code: ✅