名称: 网络存档 描述: 网页存档和从缓存/删除源检索。用于访问不可用页面、保存网页内容、创建法律证据档案或构建冗余存档工作流。涵盖Wayback Machine、Archive.today、ArchiveBox和证据保存工具。
网络存档方法论
用于新闻、研究和法律目的访问不可访问网页和保存网页内容的模式。
存档服务层次结构
按此顺序尝试服务以获得最大覆盖:
┌─────────────────────────────────────────────────────────────────┐
│ 存档检索级联 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 1. Wayback Machine (archive.org) │
│ └─ 916B+ 页面,历史深度,API 访问 │
│ ↓ 未找到 │
│ 2. Archive.today (archive.is/archive.ph) │
│ └─ 按需快照,付费墙绕过 │
│ ↓ 未找到 │
│ 3. Google Cache (有限可用性) │
│ └─ 最近页面,搜索: cache:url │
│ ↓ 未找到 │
│ 4. Bing Cache │
│ └─ 在搜索结果中点击下拉箭头 │
│ ↓ 未找到 │
│ 5. Memento Time Travel (聚合器) │
│ └─ 同时搜索多个存档 │
│ │
└─────────────────────────────────────────────────────────────────┘
Wayback Machine API
检查URL是否已存档
import requests
from typing import Optional
from datetime import datetime
def check_wayback_availability(url: str) -> Optional[dict]:
    """Check whether a URL has a snapshot in the Wayback Machine.

    Args:
        url: The original URL to look up.

    Returns:
        A dict with 'available', 'url', 'timestamp' and 'status' for the
        closest snapshot, or None when no snapshot exists or the lookup
        fails (best-effort: network/parse errors are swallowed).
    """
    api_url = "https://archive.org/wayback/available"
    try:
        # Pass the target as a query parameter so characters like '&' or
        # '#' in the URL are escaped instead of corrupting the request
        # (the original f-string interpolation left them unescaped).
        response = requests.get(api_url, params={'url': url}, timeout=10)
        data = response.json()
        closest = data.get('archived_snapshots', {}).get('closest')
        if closest:
            return {
                'available': closest.get('available', False),
                'url': closest.get('url'),
                'timestamp': closest.get('timestamp'),
                'status': closest.get('status'),
            }
        return None
    except Exception:
        # Best-effort lookup: treat any failure as "not archived".
        return None
def get_wayback_url(url: str, timestamp: Optional[str] = None) -> str:
    """Build a Wayback Machine URL for *url*.

    Args:
        url: The original URL to retrieve.
        timestamp: Optional YYYYMMDDHHMMSS string, or None for the latest
            snapshot.

    Returns:
        The web.archive.org replay URL.
    """
    if timestamp:
        return f"https://web.archive.org/web/{timestamp}/{url}"
    return f"https://web.archive.org/web/{url}"
保存页面到Wayback Machine
def save_to_wayback(url: str) -> Optional[str]:
    """Ask the Wayback Machine to capture *url*.

    Returns the archived URL on success, otherwise None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (research-archiver)'
    }
    try:
        response = requests.get(
            f"https://web.archive.org/save/{url}",
            headers=headers,
            timeout=60,
        )
    except Exception:
        return None
    # Anything other than 200 means the save request was rejected.
    if response.status_code != 200:
        return None
    # On success the snapshot path is exposed via the Content-Location header.
    location = response.headers.get('Content-Location')
    if location:
        return f"https://web.archive.org{location}"
    return response.url
CDX API用于历史快照
def get_all_snapshots(url: str, limit: int = 100) -> list[dict]:
    """Fetch archived snapshots of *url* via the Wayback CDX API.

    Returns a list of dicts with timestamp/status metadata, each carrying
    a ready-made 'wayback_url' replay link.
    """
    query = {
        'url': url,
        'output': 'json',
        'limit': limit,
        'fl': 'timestamp,original,statuscode,digest,length',
    }
    try:
        response = requests.get(
            "http://web.archive.org/cdx/search/cdx",
            params=query,
            timeout=30,
        )
        rows = response.json()
        # The first row of the CDX JSON payload is the column header;
        # anything shorter means there are no snapshots.
        if len(rows) < 2:
            return []
        columns = rows[0]
        snapshots = []
        for record in rows[1:]:
            entry = dict(zip(columns, record))
            entry['wayback_url'] = (
                f"https://web.archive.org/web/{entry['timestamp']}/{entry['original']}"
            )
            snapshots.append(entry)
        return snapshots
    except Exception:
        return []
Archive.today集成
保存到Archive.today
import requests
from urllib.parse import quote
def save_to_archive_today(url: str) -> Optional[str]:
    """Submit *url* to Archive.today for archiving.

    Note: Archive.today rate-limits and may require a CAPTCHA. This
    works for light archiving; high-volume use may need manual
    intervention.
    """
    payload = {
        'url': url,
        'anyway': '1'  # archive even if a recent snapshot exists
    }
    try:
        response = requests.post(
            "https://archive.today/submit/",
            data=payload,
            timeout=60,
        )
    except Exception:
        return None
    # Archive.today redirects to the archived copy in the response.
    return response.url if response.status_code == 200 else None
def search_archive_today(url: str) -> Optional[str]:
    """Look up an existing Archive.today snapshot of *url*."""
    lookup = f"https://archive.today/{quote(url, safe='')}"
    try:
        response = requests.get(lookup, timeout=30, allow_redirects=True)
    except Exception:
        return None
    hit = response.status_code == 200 and 'archive.today' in response.url
    return response.url if hit else None
多存档冗余
存档级联以实现最大保存
from dataclasses import dataclass
from typing import Optional, List
from concurrent.futures import ThreadPoolExecutor, as_completed
@dataclass
class ArchiveResult:
    """Outcome of one archive attempt against a single service."""
    service: str  # service name ('wayback', 'archive_today', 'perma_cc', ...)
    url: str  # the original URL we tried to archive
    archived_url: Optional[str]  # location of the saved copy; None on failure
    success: bool  # True when an archived_url was obtained
    error: Optional[str] = None  # exception text when the attempt raised
class MultiArchiver:
    """Archive a URL to multiple services for redundancy.

    Services are attempted either concurrently (thread pool) or
    sequentially; every attempt is reported as an ArchiveResult so the
    caller can see which copies succeeded.
    """

    def __init__(self):
        # (name, save callable) pairs; order matters only in sequential mode.
        self.services = [
            ('wayback', self._save_wayback),
            ('archive_today', self._save_archive_today),
            ('perma_cc', self._save_perma),  # requires an API key
        ]

    def archive_url(self, url: str, parallel: bool = True) -> List[ArchiveResult]:
        """Archive *url* to all configured services.

        Args:
            url: The URL to archive.
            parallel: If True, archive to all services concurrently.

        Returns:
            One ArchiveResult per service — in completion order when
            parallel, otherwise in configuration order.
        """
        if parallel:
            with ThreadPoolExecutor(max_workers=3) as executor:
                futures = {
                    executor.submit(save_func, url): name
                    for name, save_func in self.services
                }
                return [
                    self._attempt(futures[future], url, future.result)
                    for future in as_completed(futures)
                ]
        return [
            self._attempt(name, url, lambda f=save_func: f(url))
            for name, save_func in self.services
        ]

    def _attempt(self, service: str, url: str, produce) -> ArchiveResult:
        """Run one zero-arg archive attempt and wrap it in an ArchiveResult.

        Centralizes the try/result-building logic that was previously
        duplicated across the parallel and sequential branches.
        """
        try:
            archived_url = produce()
        except Exception as e:
            return ArchiveResult(
                service=service,
                url=url,
                archived_url=None,
                success=False,
                error=str(e),
            )
        return ArchiveResult(
            service=service,
            url=url,
            archived_url=archived_url,
            success=archived_url is not None,
        )

    def _save_wayback(self, url: str) -> Optional[str]:
        return save_to_wayback(url)

    def _save_archive_today(self, url: str) -> Optional[str]:
        return save_to_archive_today(url)

    def _save_perma(self, url: str) -> Optional[str]:
        # Requires a Perma.cc API key; implementation depends on credentials.
        return None
自托管存档与ArchiveBox
ArchiveBox设置
# Install ArchiveBox
pip install archivebox

# Or use Docker
docker pull archivebox/archivebox

# Initialize the archive directory
mkdir ~/web-archives && cd ~/web-archives
archivebox init

# Add a URL to the archive
archivebox add "https://example.com/article"

# Add multiple URLs from a file
archivebox add --depth=0 < urls.txt

# Schedule recurring archiving
archivebox schedule --every=day --depth=1 "https://example.com/feed.rss"
ArchiveBox Python集成
import subprocess
from pathlib import Path
from typing import List, Optional
class ArchiveBoxManager:
    """Manage a local ArchiveBox instance via its command-line interface."""

    def __init__(self, archive_dir: Path):
        self.archive_dir = archive_dir
        self._ensure_initialized()

    def _ensure_initialized(self):
        """Run `archivebox init` once if the archive has no index yet."""
        index = self.archive_dir / 'index.sqlite3'
        if index.exists():
            return
        subprocess.run(
            ['archivebox', 'init'],
            cwd=self.archive_dir,
            check=True,
        )

    def add_url(self, url: str, depth: int = 0) -> bool:
        """Archive a single URL.

        Args:
            url: The URL to archive.
            depth: 0 for the single page, 1 to also follow links one
                level deep.
        """
        completed = subprocess.run(
            ['archivebox', 'add', f'--depth={depth}', url],
            cwd=self.archive_dir,
            capture_output=True,
            text=True,
        )
        return completed.returncode == 0

    def add_urls_from_file(self, filepath: Path) -> bool:
        """Archive URLs from a text file (one per line)."""
        with open(filepath) as source:
            completed = subprocess.run(
                ['archivebox', 'add', '--depth=0'],
                cwd=self.archive_dir,
                stdin=source,
                capture_output=True,
            )
        return completed.returncode == 0

    def search(self, query: str) -> List[dict]:
        """Search archived content.

        Output parsing is not implemented yet; always returns [].
        """
        subprocess.run(
            ['archivebox', 'list', '--filter-type=search', query],
            cwd=self.archive_dir,
            capture_output=True,
            text=True,
        )
        # TODO: parse the command output into dicts.
        return []
法律证据保存
保管链文档化
import hashlib
import json
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
@dataclass
class EvidenceRecord:
    """Legally defensible record of a captured web page."""
    # --- content identity ---
    original_url: str
    archived_urls: List[str]  # multiple redundant archived copies
    content_hash_sha256: str  # SHA-256 hex digest of the captured bytes
    # --- timestamps (ISO 8601, UTC) ---
    capture_time_utc: str
    first_observed: str
    # --- metadata ---
    page_title: str
    captured_by: str
    capture_method: str
    tool_versions: dict
    # --- chain of custody ---
    custody_log: List[dict]  # who accessed the evidence, and when

    def add_custody_entry(self, accessor: str, action: str, notes: str = ""):
        """Append an access event to the chain-of-custody log."""
        self.custody_log.append({
            # Timezone-aware UTC: datetime.utcnow() is deprecated and
            # yields naive timestamps, which are ambiguous as evidence.
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'accessor': accessor,
            'action': action,
            'notes': notes
        })

    def to_json(self) -> str:
        """Serialize the full record (including custody log) as pretty JSON."""
        return json.dumps(asdict(self), indent=2)

    @classmethod
    def from_capture(cls, url: str, content: bytes, captured_by: str) -> "EvidenceRecord":
        """Create an evidence record from freshly captured content."""
        now = datetime.now(timezone.utc).isoformat()
        return cls(
            original_url=url,
            archived_urls=[],
            content_hash_sha256=hashlib.sha256(content).hexdigest(),
            capture_time_utc=now,
            first_observed=now,
            page_title="",
            captured_by=captured_by,
            capture_method="automated_capture",
            tool_versions={
                'archiver': '1.0.0',
                'python': '3.11'
            },
            custody_log=[]
        )
def capture_as_evidence(url: str, captured_by: str) -> EvidenceRecord:
    """Capture a URL with full evidence-chain documentation."""
    # Capture the content.
    # NOTE(review): no timeout is passed to requests.get — confirm that a
    # hang on an unresponsive host is acceptable for evidence capture.
    response = requests.get(url)
    content = response.content
    # Create the evidence record.
    record = EvidenceRecord.from_capture(url, content, captured_by)
    # NOTE(review): extract_title is not defined in this chunk —
    # presumably a helper elsewhere in the file; verify it exists.
    record.page_title = extract_title(content)
    # Archive to multiple services for redundancy.
    archiver = MultiArchiver()
    results = archiver.archive_url(url)
    for result in results:
        if result.success:
            record.archived_urls.append(result.archived_url)
    # Log the initial capture in the chain of custody.
    record.add_custody_entry(
        captured_by,
        'initial_capture',
        f'从{url}捕获,存档到{len(record.archived_urls)}个服务'
    )
    return record
Perma.cc用于法律引用
import requests
from typing import Optional
class PermaCC:
    """Perma.cc API client for legal-grade archiving.

    Requires a perma.cc API key (free tier with limited use).
    Used by US courts and legal professionals.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.perma.cc/v1"
        self.headers = {
            'Authorization': f'ApiKey {api_key}',
            'Content-Type': 'application/json'
        }

    def create_archive(self, url: str, folder_id: Optional[int] = None) -> Optional[dict]:
        """Create a new Perma.cc archive.

        Args:
            url: The URL to archive.
            folder_id: Optional Perma.cc folder to file the archive under.

        Returns:
            Dict with guid, url, creation_timestamp and title on success,
            otherwise None.
        """
        data = {'url': url}
        # Test against None so a legitimate folder id of 0 is not dropped
        # (the original truthiness check silently ignored folder_id=0).
        if folder_id is not None:
            data['folder'] = folder_id
        try:
            response = requests.post(
                f"{self.base_url}/archives/",
                json=data,
                headers=self.headers,
                timeout=60
            )
            # 201 Created is the documented success status.
            if response.status_code == 201:
                result = response.json()
                return {
                    'guid': result['guid'],
                    'url': f"https://perma.cc/{result['guid']}",
                    'creation_timestamp': result['creation_timestamp'],
                    'title': result.get('title', '')
                }
            return None
        except Exception:
            return None

    def get_archive(self, guid: str) -> Optional[dict]:
        """Retrieve archive metadata by GUID; None on any failure."""
        try:
            response = requests.get(
                f"{self.base_url}/archives/{guid}/",
                headers=self.headers,
                timeout=30
            )
            return response.json() if response.status_code == 200 else None
        except Exception:
            return None
浏览器扩展和书签工具
快速存档书签工具
// Save to the Wayback Machine - add as a bookmarklet
javascript:(function(){
    var url = location.href;
    window.open('https://web.archive.org/save/' + url, '_blank');
})();

// Save to Archive.today
javascript:(function(){
    var url = location.href;
    window.open('https://archive.today/?run=1&url=' + encodeURIComponent(url), '_blank');
})();

// Check all archives (Memento)
javascript:(function(){
    var url = location.href;
    window.open('http://timetravel.mementoweb.org/list/0/' + url, '_blank');
})();
复活死页面的书签工具
// Try multiple archives for a dead page
javascript:(function(){
    var url = location.href;
    var archives = [
        'https://web.archive.org/web/*/' + url,
        'https://archive.today/' + encodeURIComponent(url),
        'https://webcache.googleusercontent.com/search?q=cache:' + url,
        'http://timetravel.mementoweb.org/list/0/' + url
    ];
    // Note: pop-up blockers may allow only the first window.open call.
    archives.forEach(function(a){ window.open(a, '_blank'); });
})();
存档服务比较
| 服务 | 最佳用途 | API | 删除 | 最大大小 |
|---|---|---|---|---|
| Wayback Machine | 历史研究 | 是(免费) | 按请求 | 无限 |
| Archive.today | 付费墙绕过,快速保存 | 否 | 从不 | 50MB |
| Perma.cc | 法律引用 | 是(免费层) | 由创建者 | 标准页面 |
| ArchiveBox | 自托管,隐私 | 本地 | 从不 | 磁盘空间 |
| Conifer | 交互式内容 | 是 | 由创建者 | 5GB免费 |
错误处理和回退
from enum import Enum
from typing import Optional
class ArchiveError(Enum):
    """Failure modes when retrieving an archived page.

    Values are human-readable (Chinese) messages surfaced to users.
    """
    NOT_FOUND = "未找到存档"  # no archive exists for the URL
    RATE_LIMITED = "服务速率限制"  # the service throttled our requests
    BLOCKED = "URL被阻止存档"  # the URL is excluded from archiving
    TIMEOUT = "请求超时"  # the request timed out
    SERVICE_DOWN = "存档服务不可用"  # the archive service is unreachable
def get_archived_page(url: str) -> tuple[Optional[str], Optional[ArchiveError]]:
    """Try every archive service in order with error handling.

    Returns:
        (archived_url, None) on success, or (None, ArchiveError.NOT_FOUND)
        when every service fails.
    """
    # 1. Wayback Machine first (largest corpus).
    try:
        result = check_wayback_availability(url)
        if result and result.get('available'):
            return result['url'], None
    except Exception:
        # The original had a separate `except requests.Timeout: pass`
        # before this clause with the identical action — collapsed.
        pass  # fall through to the next service

    # 2. Archive.today.
    try:
        result = search_archive_today(url)
        if result:
            return result, None
    except Exception:
        pass

    # 3. Memento aggregator (searches many archives at once).
    try:
        memento_url = f"http://timetravel.mementoweb.org/api/json/0/{url}"
        response = requests.get(memento_url, timeout=30)
        data = response.json()
        closest = data.get('mementos', {}).get('closest')
        if closest:
            # NOTE(review): 'uri' is assumed to be a list — confirm against
            # the Memento Time Travel API response shape.
            return closest['uri'][0], None
    except Exception:
        pass

    return None, ArchiveError.NOT_FOUND
最佳实践
何时存档
- 发布前: 存档所有引用的来源
- 突发新闻: 立即存档,内容可能更改或消失
- 法律事务: 创建带时间戳的证据并多个存档
- 研究: 为可重现性存档主要来源
- 社交媒体: 存档帖子以防被删除
存档冗余
始终存档到至少两个服务:
def ensure_archived(url: str) -> bool:
    """Return True once *url* is archived to at least two services."""
    outcomes = MultiArchiver().archive_url(url)
    return sum(1 for outcome in outcomes if outcome.success) >= 2
速率限制和伦理
- 尊重robots.txt进行批量存档
- 请求间添加延迟(至少1-3秒)
- 未经同意不要存档个人/私人页面
- 可用时使用API密钥以获得更好的速率限制
- 缓存结果以避免冗余请求