name: 自然语言处理
description: 使用transformers库构建NLP应用,BERT,GPT,文本分类,命名实体识别,情感分析
自然语言处理
概览
这项技能提供了使用现代transformers,BERT,GPT和经典NLP技术构建NLP应用的全面工具,用于文本分类,命名实体识别,情感分析等。
使用场景
- 构建用于情感分析,主题分类或意图检测的文本分类系统
- 从非结构化文本中提取命名实体(人名,地点,组织)
- 实施机器翻译,文本摘要或问答系统
- 处理和分析大量文本数据以获取洞察
- 创建聊天机器人,虚拟助手或会话AI应用
- 为特定领域的NLP任务微调预训练的transformer模型
NLP核心任务
- 文本分类:情感,主题,意图分类
- 命名实体识别:识别人名,地点,组织
- 机器翻译:文本之间的语言翻译
- 文本摘要:提取关键信息
- 问答:在文档中找到答案
- 文本生成:生成连贯的文本
流行模型和库
- Transformers:BERT,GPT,RoBERTa,T5
- spaCy:工业级NLP管道
- NLTK:经典NLP工具包
- Hugging Face:预训练模型中心
- PyTorch/TensorFlow:深度学习框架
Python实现
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
AutoModelForTokenClassification, pipeline,
TextClassificationPipeline)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')
# Download every NLTK resource this script actually uses: 'punkt' for
# word_tokenize, 'stopwords' for stop-word removal, and 'wordnet' for
# WordNetLemmatizer.  The original ensured only 'punkt', so the first
# preprocess_text() call could fail with a LookupError.
for _resource, _path in [('punkt', 'tokenizers/punkt'),
                         ('stopwords', 'corpora/stopwords'),
                         ('wordnet', 'corpora/wordnet')]:
    try:
        nltk.data.find(_path)
    except LookupError:
        nltk.download(_resource)
print("=== 1. 文本预处理 ===")
def preprocess_text(text, remove_stopwords=True, lemmatize=True):
    """Run the full text-preprocessing pipeline on *text*.

    Steps: lowercase, strip everything but ASCII letters and whitespace,
    tokenize, then optionally drop English stop words and lemmatize.

    Returns:
        (token_list, joined_string) — the processed tokens and the same
        tokens re-joined with single spaces.
    """
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = word_tokenize(cleaned)
    if remove_stopwords:
        blocked = set(stopwords.words('english'))
        words = [w for w in words if w not in blocked]
    if lemmatize:
        lemmatize_word = WordNetLemmatizer().lemmatize
        words = [lemmatize_word(w) for w in words]
    return words, ' '.join(words)
# Demonstrate the preprocessing pipeline on a sample sentence.
sample_text = "The quick brown foxes are jumping over the lazy dogs! Amazing performance."
tokens, processed = preprocess_text(sample_text)
print(f"Original: {sample_text}")
print(f"Processed: {processed}")
# FIX: the trailing '\n' had been mangled into a literal newline inside
# the f-string, which is a SyntaxError.
print(f"Tokens: {tokens}\n")
# 2. Traditional text classification with sklearn
print("=== 2. 传统文本分类 ===")
# Toy labelled corpus for a sentiment classifier.
texts = [
    "I love this product, it's amazing!",
    "This movie is fantastic and entertaining.",
    "Best purchase ever, highly recommended.",
    "Terrible quality, very disappointed.",
    "Worst experience, waste of money.",
    "Horrible service and poor quality.",
    "The food was delicious and fresh.",
    "Great atmosphere and friendly staff.",
    "Bad weather today, very gloomy.",
    "The book was boring and uninteresting."
]
labels = [1, 1, 1, 0, 0, 0, 1, 1, 0, 0]  # 1: positive, 0: negative
# TF-IDF vectorization over unigrams and bigrams.
tfidf = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(texts)
# Train a multinomial Naive Bayes classifier.
clf = MultinomialNB()
clf.fit(X_tfidf, labels)
# NOTE(review): metrics are computed on the training set itself, so they
# only demonstrate the API — they do not estimate generalization.
predictions = clf.predict(X_tfidf)
print(f"Accuracy: {accuracy_score(labels, predictions):.4f}")
print(f"Precision: {precision_score(labels, predictions):.4f}")
print(f"Recall: {recall_score(labels, predictions):.4f}")
# FIX: restored the '\n' that had been mangled into a literal newline.
print(f"F1: {f1_score(labels, predictions):.4f}\n")
# 3. Transformer-based text classification
print("=== 3. 基于Transformer的分类 ===")
try:
    # Sentiment analysis via a Hugging Face pipeline; wrapped in
    # try/except because the checkpoint download can fail offline.
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english"
    )
    test_sentences = [
        "This is a wonderful movie!",
        "I absolutely hate this product.",
        "It's okay, nothing special.",
        "Amazing quality and fast delivery!"
    ]
    print("Sentiment Analysis Results:")
    for sentence in test_sentences:
        result = sentiment_pipeline(sentence)
        print(f" Text: {sentence}")
        # FIX: restored '\n' (was a literal newline inside the f-string,
        # a SyntaxError).
        print(f" Sentiment: {result[0]['label']}, Score: {result[0]['score']:.4f}\n")
except Exception as e:
    print(f"Transformer model not available: {str(e)}\n")
# 4. Named Entity Recognition (NER)
print("=== 4. 命名实体识别 ===")
try:
    # FIX: the original loaded "distilbert-base-uncased", a base model
    # with no NER fine-tuning, so aggregation would emit meaningless
    # LABEL_* groups.  Use a checkpoint fine-tuned for CoNLL-03 entity
    # tagging instead (the HF default for the "ner" task).
    ner_pipeline = pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        aggregation_strategy="simple"
    )
    text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
    entities = ner_pipeline(text)
    print(f"Text: {text}")
    print("Entities:")
    for entity in entities:
        print(f" {entity['word']}: {entity['entity_group']} (score: {entity['score']:.4f})")
except Exception as e:
    # FIX: restored '\n' (was a literal newline inside the f-string).
    print(f"NER model not available: {str(e)}\n")
# 5. Word embeddings and document similarity
# FIX: restored '\n' (was a literal newline inside the string literal).
print("\n=== 5. 词嵌入和相似度 ===")
from sklearn.metrics.pairwise import cosine_similarity
# Simple bag-of-words "embedding": raw token counts per document.
vectorizer = CountVectorizer(max_features=50)
docs = [
    "machine learning is great",
    "deep learning uses neural networks",
    "machine learning and deep learning"
]
embeddings = vectorizer.fit_transform(docs).toarray()
# Pairwise cosine similarity between the count vectors.
similarity_matrix = cosine_similarity(embeddings)
print("Document Similarity Matrix:")
print(pd.DataFrame(similarity_matrix, columns=[f"Doc{i}" for i in range(len(docs))],
                   index=[f"Doc{i}" for i in range(len(docs))]).round(3))
# 6. Tokenization and vocabulary analysis
# FIX: restored '\n' (was a literal newline inside the string literal).
print("\n=== 6. 分词分析 ===")
corpus = " ".join(texts)
tokens, _ = preprocess_text(corpus)
# Vocabulary with token frequencies (Counter maps word -> count).
vocab = Counter(tokens)
print(f"Vocabulary size: {len(vocab)}")
print("Top 10 most common words:")
for word, count in vocab.most_common(10):
    print(f" {word}: {count}")
# 7. Advanced transformer pipelines
# FIX: restored '\n' (was a literal newline inside the string literal).
print("\n=== 7. 高级NLP任务 ===")
try:
    # Zero-shot classification: scores arbitrary candidate labels
    # without any task-specific fine-tuning.
    zero_shot_pipeline = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli"
    )
    sequence = "Apple is discussing the possibility of acquiring startup for 1 billion dollars"
    candidate_labels = ["business", "sports", "technology", "politics"]
    result = zero_shot_pipeline(sequence, candidate_labels)
    print("Zero-shot Classification Results:")
    for label, score in zip(result['labels'], result['scores']):
        print(f" {label}: {score:.4f}")
except Exception as e:
    print(f"Advanced pipeline not available: {str(e)}\n")
# 8. Text statistics and analysis
# FIX: restored '\n' (was a literal newline inside the string literal).
print("\n=== 8. 文本统计 ===")
sample_texts = [
    "Natural language processing is fascinating.",
    "Machine learning enables artificial intelligence.",
    "Deep learning revolutionizes computer vision."
]
stats_data = []
for text in sample_texts:
    words = text.split()
    chars = len(text)
    avg_word_len = np.mean([len(w) for w in words])
    stats_data.append({
        # Truncate long texts to 40 chars for a compact table.
        'Text': text[:40] + '...' if len(text) > 40 else text,
        'Words': len(words),
        'Characters': chars,
        'Avg Word Len': avg_word_len
    })
stats_df = pd.DataFrame(stats_data)
print(stats_df.to_string(index=False))
# 9. Visualization dashboard: word frequency, sentiment split,
# similarity heatmap, and text-length histogram.
# FIX: restored '\n' (was a literal newline inside the string literal).
print("\n=== 9. NLP可视化 ===")
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Word frequency (top 15 from the vocabulary built in section 6).
word_freq = vocab.most_common(15)
words, freqs = zip(*word_freq)
axes[0, 0].barh(range(len(words)), freqs, color='steelblue')
axes[0, 0].set_yticks(range(len(words)))
axes[0, 0].set_yticklabels(words)
axes[0, 0].set_xlabel('Frequency')
axes[0, 0].set_title('Top 15 Most Frequent Words')
axes[0, 0].invert_yaxis()
# Sentiment distribution (hard-coded demo labels).
sentiments = ['Positive', 'Negative', 'Positive', 'Negative', 'Positive']
sentiment_counts = Counter(sentiments)
axes[0, 1].pie(sentiment_counts.values(), labels=sentiment_counts.keys(),
               autopct='%1.1f%%', colors=['green', 'red'])
axes[0, 1].set_title('Sentiment Distribution')
# Document similarity heatmap (matrix computed in section 5).
im = axes[1, 0].imshow(similarity_matrix, cmap='YlOrRd', aspect='auto')
axes[1, 0].set_xticks(range(len(docs)))
axes[1, 0].set_yticks(range(len(docs)))
axes[1, 0].set_xticklabels([f'Doc{i}' for i in range(len(docs))])
axes[1, 0].set_yticklabels([f'Doc{i}' for i in range(len(docs))])
axes[1, 0].set_title('Document Similarity Heatmap')
plt.colorbar(im, ax=axes[1, 0])
# Text length distribution over the classification corpus.
text_lengths = [len(t.split()) for t in texts]
axes[1, 1].hist(text_lengths, bins=5, color='coral', edgecolor='black')
axes[1, 1].set_xlabel('Number of Words')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Text Length Distribution')
axes[1, 1].grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('nlp_analysis.png', dpi=100, bbox_inches='tight')
plt.close(fig)  # release figure memory once the PNG is written
print("\nNLP可视化保存为 'nlp_analysis.png'")
# 10. Summary of the whole run.
# FIX: restored '\n' in both prints (literal newlines inside string
# literals were SyntaxErrors).
print("\n=== NLP总结 ===")
print(f"Texts processed: {len(texts)}")
print(f"Unique vocabulary: {len(vocab)} words")
print(f"Average text length: {np.mean([len(t.split()) for t in texts]):.2f} words")
print(f"Classification accuracy: {accuracy_score(labels, predictions):.4f}")
print("\n自然语言处理设置完成!")
常见NLP任务和模型
- 分类:DistilBERT,RoBERTa,ELECTRA
- NER:BioBERT,SciBERT,spaCy模型
- 翻译:MarianMT,M2M-100
- 摘要:BART,Pegasus,T5
- QA:BERT,RoBERTa,DeBERTa
文本预处理管道
- 小写化和清洗
- 分词
- 停用词移除
- 词形还原/词干提取
- 向量化
最佳实践
- 可用时使用预训练模型
- 在特定任务数据上微调
- 处理词汇表外的词
- 批量处理以提高效率
- 监控模型中的偏见
交付物
- 训练好的NLP模型
- 文本分类结果
- 提取的命名实体
- 性能指标
- 可视化仪表板
- 推理API