name: phone-specs-scraper
description: "从GSM Arena、PhoneDB和其他网站抓取手机规格。使用场景:(1) 比较智能手机规格,(2) 研究设备特性,(3) 构建手机比较工具。"
手机规格抓取工具
从热门手机数据库网站提取详细的智能手机规格,用于比较和研究。
何时使用
- 用户要求比较手机规格
- 购买前研究设备特性
- 构建手机比较工具或数据库
- 查找详细技术规格
所需工具 / APIs
- 无需外部API
- Web抓取工具:curl、wget或Playwright/Puppeteer
- HTML解析:BeautifulSoup (Python)、cheerio (Node.js)或grep/sed (bash)
手机规格网站
主要来源
1. GSM Arena (gsmarena.com)
- 最大的手机数据库,包含详细规格
- URL模式:
https://www.gsmarena.com/[brand]_[model]-[id].php
- 示例:
https://www.gsmarena.com/google_pixel_9-13242.php
2. PhoneDB (phonedb.net)
- 详细设备规格数据库
- URL模式:
https://phonedb.net/index.php?m=device&s=query&d=[device-name]
3. PhoneArena (phonearena.com)
- 手机评论和比较
- URL模式:
https://www.phonearena.com/phones/[brand]-[model]
4. MK Mobile Arena (mkmobilearena.com)
- 并排手机比较
- URL模式:
https://mkmobilearena.com/phone-compare/[phone1]-vs-[phone2]
5. TechRadar (techradar.com)
- 手机评论与规格比较
- URL模式:
https://www.techradar.com/phones/[brand]-[model]
6. DeviceBeast (devicebeast.com)
- 全面的设备规格
- URL模式:
https://devicebeast.com/phones/[brand]-[model]
7. Comparigon (comparigon.com)
- 并排比较
- URL模式:
https://comparigon.com/phones/[brand]-[model]
8. SpecsBattle (specsbattle.com)
- 详细手机比较
- URL模式:
https://specsbattle.com/phones/[brand]-[model]
技能
scrape_gsmarena_specs
使用curl和grep从GSM Arena抓取手机规格。
# Get phone specs from GSM Arena
PHONE_URL="https://www.gsmarena.com/google_pixel_9-13242.php"
# Download the page once and reuse it, so both extractions below cost a single
# request. -f makes curl fail on HTTP errors (e.g. 403) instead of passing an
# error page on to grep; -sS stays quiet but still prints real errors.
PAGE_HTML="$(curl -fsS "$PHONE_URL")"
# Extract key specs: text that directly follows <td class="ttl"> (first 20 hits)
printf '%s\n' "$PAGE_HTML" | grep -oP '(?<=<td class="ttl">)[^<]+' | head -20
# Extract specifications table: <td> text on each class="specs-cp" line and the 2 lines after it
printf '%s\n' "$PAGE_HTML" | grep -A2 'class="specs-cp"' | grep -oP '(?<=<td>)[^<]+'
Node.js with cheerio:
const cheerio = require('cheerio');
/**
 * Scrape a GSM Arena phone page into a nested specs object.
 *
 * NOTE(review): the CSS selectors below are taken as-is; confirm they still
 * match the live page markup before relying on the output.
 *
 * @param {string} url - Phone page URL, e.g.
 *   https://www.gsmarena.com/google_pixel_9-13242.php
 * @returns {Promise<Object>} `name`, `released` and `weight` strings, plus one
 *   object per spec table keyed by that table's header text, mapping each row
 *   label (`td.ttl`) to its value (`td.nfo`).
 * @throws {Error} If the server answers with a non-2xx status.
 */
async function scrapeGSMArenaSpecs(url) {
  const response = await fetch(url);
  // A blocked request (e.g. 403) still carries a body; parsing it would
  // silently yield an empty specs object instead of surfacing the failure.
  if (!response.ok) {
    throw new Error(`Request to ${url} failed with HTTP ${response.status}`);
  }
  const $ = cheerio.load(await response.text());

  // Extract basic info
  const specs = {
    name: $('h1.specs-phone-name').text().trim(),
    released: $('.specs-cp .specs-cp-release').text().trim(),
    weight: $('.specs-cp .specs-cp-weight').text().trim(),
  };

  // Extract all specification tables, one nested object per table
  $('.specs-cp table').each((_tableIndex, table) => {
    const category = $(table).find('th').text().trim();
    const rows = {};
    $(table)
      .find('tr')
      .each((_rowIndex, row) => {
        const key = $(row).find('td.ttl').text().trim();
        const value = $(row).find('td.nfo').text().trim();
        // Rows missing either a label or a value are skipped.
        if (key && value) {
          rows[key] = value;
        }
      });
    specs[category] = rows;
  });

  return specs;
}
// Usage
// scrapeGSMArenaSpecs('https://www.gsmarena.com/google_pixel_9-13242.php').then(console.log);
search_phone_specs
跨多个来源搜索手机规格。
# Search using SearXNG for phone comparison pages
QUERY="Google Pixel 9 vs Pixel 10 specs comparison"
INSTANCE="https://searx.party"
# -G appends the --data-urlencode pairs to the URL as an encoded query string,
# so the spaces in QUERY no longer produce a malformed URL.
# jq's contains() is a literal substring test, so the site alternation has to
# go through test(), which takes a regular expression.
curl -sG "${INSTANCE}/search" --data-urlencode "q=${QUERY}" --data-urlencode "format=json" | jq -r '.results[] | select(.url | test("gsmarena|phonedb|mkmobilearena")) | {title, url}'
Node.js:
/**
 * Search SearXNG instances for pages comparing two phones and keep only the
 * results hosted on known phone-spec sites.
 *
 * Instances are tried in order; the first one that yields at least one
 * matching result wins. Failures are logged with console.warn and skipped.
 *
 * @param {string} phone1 - First phone name, e.g. 'Google Pixel 9'.
 * @param {string} phone2 - Second phone name.
 * @returns {Promise<Array<Object>>} Result entries (as returned by the
 *   instance) whose `url` contains one of the spec-site domains.
 * @throws {Error} If no instance produced a matching result.
 */
async function searchPhoneSpecs(phone1, phone2) {
  const query = `${phone1} vs ${phone2} specs comparison`;
  const searxInstances = [
    'https://searx.party',
    'https://searx.tiekoetter.com',
    'https://searx.ninja',
  ];
  const specDomains = [
    'gsmarena.com',
    'mkmobilearena.com',
    'techradar.com',
    'phonedb.net',
  ];
  // The query string is identical for every instance, so build it once.
  const params = new URLSearchParams({
    q: query,
    format: 'json',
    language: 'en',
  });

  for (const instance of searxInstances) {
    try {
      // The standard fetch API has no `timeout` option (it is silently
      // ignored); an abort signal is what actually bounds the request.
      const res = await fetch(`${instance}/search?${params}`, {
        signal: AbortSignal.timeout(10000),
      });
      if (!res.ok) {
        throw new Error(`HTTP ${res.status}`);
      }
      const data = await res.json();
      // Filter for phone spec sites
      const specSites = data.results.filter((result) =>
        specDomains.some((domain) => result.url.includes(domain)),
      );
      if (specSites.length > 0) {
        return specSites;
      }
    } catch (err) {
      console.warn(`Instance ${instance} failed: ${err.message}`);
    }
  }
  throw new Error('No working search instances found');
}
// Usage
// searchPhoneSpecs('Google Pixel 9', 'Google Pixel 10').then(console.log);
compare_two_phones
比较两部手机的规格,突出显示差异。
/**
 * Compare two phone spec objects field by field.
 *
 * Only a fixed list of key fields is inspected. Values are compared with
 * strict equality, so a field missing from both objects counts as a match.
 * For battery, ram and storage the leading integer of each value is parsed to
 * classify the change; units are not normalized, so '1 TB' vs '512 GB'
 * compares 1 against 512.
 *
 * @param {Object} phone1Specs - Specs of the baseline phone.
 * @param {Object} phone2Specs - Specs of the phone being compared against it.
 * @returns {Promise<{matches: Array, differences: Array, upgrades: Array, downgrades: Array}>}
 *   `matches` holds `{field, value}`; `differences` holds
 *   `{field, phone1, phone2}`; `upgrades` and `downgrades` hold
 *   `{field, from, to}` for numeric fields where phone 2 is higher or lower.
 */
async function comparePhones(phone1Specs, phone2Specs) {
  const comparison = {
    matches: [],
    differences: [],
    upgrades: [],
    downgrades: [],
  };
  // Compare key specs
  const keyFields = [
    'chipset', 'display', 'battery', 'mainCamera',
    'ram', 'storage', 'weight', 'releaseDate',
  ];
  // Fields whose value is expected to start with a comparable number.
  const numericFields = new Set(['battery', 'ram', 'storage']);

  for (const field of keyFields) {
    const val1 = phone1Specs[field];
    const val2 = phone2Specs[field];

    if (val1 === val2) {
      comparison.matches.push({ field, value: val1 });
      continue;
    }

    comparison.differences.push({ field, phone1: val1, phone2: val2 });

    if (!numericFields.has(field)) {
      continue;
    }
    // Detect upgrades/downgrades (simplified logic: leading integer only)
    const num1 = Number.parseInt(val1, 10);
    const num2 = Number.parseInt(val2, 10);
    if (Number.isNaN(num1) || Number.isNaN(num2)) {
      continue;
    }
    if (num2 > num1) {
      comparison.upgrades.push({ field, from: val1, to: val2 });
    } else if (num2 < num1) {
      comparison.downgrades.push({ field, from: val1, to: val2 });
    }
  }
  return comparison;
}
// Usage example with formatted output
/**
 * Render a comparison object as a Markdown report with three sections:
 * same specs, differences, and upgrades.
 *
 * @param {Object} comparison - Object with `matches` ({field, value}),
 *   `differences` ({field, phone1, phone2}) and `upgrades` ({field, from, to})
 *   arrays.
 * @returns {string} Markdown text; every heading and bullet ends with a newline.
 */
function formatComparison(comparison) {
  // Line breaks must be written as the \n escape: a backslash followed by a
  // real newline inside a string literal is a line continuation and emits
  // nothing, which would run all headings and bullets together.
  let output = '## Phone Comparison\n\n';

  output += '### ✅ Same Specs\n';
  for (const { field, value } of comparison.matches) {
    output += `- ${field}: ${value}\n`;
  }

  output += '\n### 🔄 Differences\n';
  for (const { field, phone1, phone2 } of comparison.differences) {
    output += `- **${field}**:\n`;
    // Two-space indent nests these bullets under the field bullet.
    output += `  - Phone 1: ${phone1}\n`;
    output += `  - Phone 2: ${phone2}\n`;
  }

  output += '\n### ⬆️ Upgrades\n';
  for (const { field, from, to } of comparison.upgrades) {
    output += `- ${field}: ${from} → ${to}\n`;
  }

  return output;
}
scrape_comparison_site
从如MK Mobile Arena等网站抓取预格式化的比较。
# Get comparison from MK Mobile Arena
COMPARISON_URL="https://mkmobilearena.com/phone-compare/google-pixel-9-vs-google-pixel-10"
# Extract comparison table
# \K discards the matched "<td ...>" prefix from the output. A lookbehind cannot
# do this here: PCRE rejects unbounded lookbehinds such as (?<=<td[^>]*>).
curl -s "$COMPARISON_URL" | grep -oP '<td[^>]*>\K[^<]+' | head -50
Node.js:
/**
 * Scrape a side-by-side comparison page whose table has the spec name in the
 * first cell and one phone per following cell.
 *
 * NOTE(review): the selectors assume a three-column table with the phone
 * names in <h3> elements of the first row; confirm against the live markup.
 *
 * @param {string} url - Comparison page URL.
 * @returns {Promise<{phone1: Object, phone2: Object, differences: Array}>}
 *   `phone1`/`phone2` map spec names to values (plus `name`); `differences`
 *   lists `{spec, phone1, phone2}` for rows whose two values differ.
 * @throws {Error} If the server answers with a non-2xx status.
 */
async function scrapeComparisonSite(url) {
  const response = await fetch(url);
  // A blocked request (e.g. 403) still carries a body; parsing it would
  // silently yield an empty comparison instead of surfacing the failure.
  if (!response.ok) {
    throw new Error(`Request to ${url} failed with HTTP ${response.status}`);
  }
  const $ = cheerio.load(await response.text());

  const comparison = {
    phone1: {},
    phone2: {},
    differences: [],
  };

  // Extract phone names
  comparison.phone1.name = $('table tr:first-child td:nth-child(2) h3').text().trim();
  comparison.phone2.name = $('table tr:first-child td:nth-child(3) h3').text().trim();

  // Extract specs row by row
  $('table tr').each((_rowIndex, row) => {
    const cells = $(row);
    const specName = cells.find('td:first-child').text().trim();
    const phone1Value = cells.find('td:nth-child(2)').text().trim();
    const phone2Value = cells.find('td:nth-child(3)').text().trim();

    // Rows with any empty cell (e.g. section separators) are skipped.
    if (!specName || !phone1Value || !phone2Value) {
      return;
    }

    comparison.phone1[specName] = phone1Value;
    comparison.phone2[specName] = phone2Value;
    if (phone1Value !== phone2Value) {
      comparison.differences.push({
        spec: specName,
        phone1: phone1Value,
        phone2: phone2Value,
      });
    }
  });

  return comparison;
}
速率限制 / 最佳实践
- 尊重robots.txt - 抓取前检查 /robots.txt
- 速率限制:每秒最多1个请求
- 缓存结果至少1小时,以避免冗余请求
- 使用轮换User-Agent字符串
- 需要时使用Playwright/Puppeteer处理JavaScript渲染的网站
- 有些网站阻止抓取器 - 使用备用来源
代理提示
您可以从GSM Arena和其他来源抓取手机规格。当用户要求比较手机时:
1. 首先尝试MK Mobile Arena或类似的显示并排规格的比较网站
2. 如果不可用,从GSM Arena抓取单个手机页面
3. 提取关键规格:显示、芯片组、电池、摄像头、RAM、存储、发布日期
4. 突出显示不同机型之间的差异和升级
5. 格式化输出,包含清晰部分:相同规格、差异、升级
始终引用来源URL并尊重网站服务条款。
故障排除
错误:“403 Forbidden”
- 网站阻止抓取器
- 解决方案:尝试替代来源(PhoneDB、PhoneArena、TechRadar)
错误:“Empty results”
- 页面结构更改或手机未找到
- 解决方案:验证手机名称拼写和型号编号
错误:“JavaScript required”
- 网站使用客户端渲染
- 解决方案:使用Playwright/Puppeteer代替curl
速率限制
- 向网站发送过多请求
- 解决方案:在请求之间添加延迟(sleep 2)