名称: 搜索 描述: 全文搜索和搜索引擎实现。当实现搜索功能、自动完成、分面搜索、相关性调整或处理搜索索引时使用。关键词:搜索、全文搜索、Elasticsearch、OpenSearch、Meilisearch、Typesense、模糊搜索、自动完成、分面搜索、分面、倒排索引、相关性、排名、评分、分词器、分析器、搜索即输入、聚合、同义词、索引、查询、过滤、高亮、搜索UI、类型提示、建议。
搜索
概述
搜索功能是现代应用程序的关键组成部分,使用户能够快速找到相关内容。此技能涵盖Elasticsearch基础、全文搜索模式、索引策略以及高级功能如分面搜索和自动完成。
关键概念
Elasticsearch基础
Elasticsearch是基于Apache Lucene的分布式搜索和分析引擎。
核心组件:
- 索引: 具有相似特征的文档集合
- 文档: 被索引和可搜索的JSON对象
- 映射: 索引中文档的模式定义
- 分片: 用于水平扩展的索引子分区
- 副本: 用于冗余和读扩展的分片副本
基本索引操作:
// Create the "products" index with explicit settings and mappings:
// 3 primary shards, 2 replicas per shard, and a custom analyzer
// (standard tokenizer followed by the lowercase and snowball token filters)
// that is applied to the "name" field.
PUT /products
{
"settings": {
"number_of_shards": 3,
"number_of_replicas": 2,
"analysis": {
"analyzer": {
"custom_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "snowball"]
}
}
}
},
"mappings": {
"properties": {
"name": { "type": "text", "analyzer": "custom_analyzer" },
"description": { "type": "text" },
"price": { "type": "float" },
"category": { "type": "keyword" },
"created_at": { "type": "date" }
}
}
}
全文搜索模式
匹配查询 - 标准全文搜索:
GET /products/_search
{
"query": {
"match": {
"description": {
"query": "wireless bluetooth headphones",
"operator": "and",
"fuzziness": "AUTO"
}
}
}
}
多匹配查询 - 跨多个字段搜索:
GET /products/_search
{
"query": {
"multi_match": {
"query": "wireless headphones",
"fields": ["name^3", "description", "category^2"],
"type": "best_fields",
"tie_breaker": 0.3
}
}
}
布尔查询 - 组合多个条件:
GET /products/_search
{
"query": {
"bool": {
"must": [
{ "match": { "name": "headphones" } }
],
"filter": [
{ "range": { "price": { "gte": 50, "lte": 200 } } },
{ "term": { "category": "electronics" } }
],
"should": [
{ "match": { "description": "noise cancelling" } }
],
"must_not": [
{ "term": { "status": "discontinued" } }
]
}
}
}
索引策略
批量索引:
POST /_bulk
{ "index": { "_index": "products", "_id": "1" } }
{ "name": "Wireless Headphones", "price": 99.99 }
{ "index": { "_index": "products", "_id": "2" } }
{ "name": "Bluetooth Speaker", "price": 49.99 }
索引别名 - 零停机重新索引:
// Repoint the "products" alias from products_v1 to products_v2.
// Both actions are submitted in a single _aliases request, so readers of
// the alias switch indices without a gap between "remove" and "add".
POST /_aliases
{
"actions": [
{ "add": { "index": "products_v2", "alias": "products" } },
{ "remove": { "index": "products_v1", "alias": "products" } }
]
}
相关性调整和提升
字段提升:
GET /products/_search
{
"query": {
"multi_match": {
"query": "headphones",
"fields": ["name^5", "description^2", "tags"]
}
}
}
函数评分 - 自定义评分:
// Custom scoring: the text relevance of the "name" match is multiplied by
//   - a fixed weight of 2 for featured products,
//   - sqrt(1.2 * popularity),
//   - a Gaussian recency decay on created_at (score halves at 30 days).
// "missing" supplies a neutral popularity for documents that lack the field;
// without it field_value_factor fails the request for such documents.
GET /products/_search
{
  "query": {
    "function_score": {
      "query": { "match": { "name": "headphones" } },
      "functions": [
        {
          "filter": { "term": { "featured": true } },
          "weight": 2
        },
        {
          "field_value_factor": {
            "field": "popularity",
            "factor": 1.2,
            "modifier": "sqrt",
            "missing": 1
          }
        },
        {
          "gauss": {
            "created_at": {
              "origin": "now",
              "scale": "30d",
              "decay": 0.5
            }
          }
        }
      ],
      "score_mode": "multiply",
      "boost_mode": "multiply"
    }
  }
}
分面搜索和聚合
术语聚合 - 类别分面:
GET /products/_search
{
"size": 0,
"aggs": {
"categories": {
"terms": { "field": "category", "size": 10 }
},
"price_ranges": {
"range": {
"field": "price",
"ranges": [
{ "to": 50, "key": "budget" },
{ "from": 50, "to": 100, "key": "mid-range" },
{ "from": 100, "key": "premium" }
]
}
},
"avg_price": {
"avg": { "field": "price" }
}
}
}
嵌套聚合:
GET /products/_search
{
"aggs": {
"categories": {
"terms": { "field": "category" },
"aggs": {
"avg_price": { "avg": { "field": "price" } },
"top_products": {
"top_hits": { "size": 3, "_source": ["name", "price"] }
}
}
}
}
}
搜索即输入和自动完成
完成建议器设置:
PUT /products
{
"mappings": {
"properties": {
"name_suggest": {
"type": "completion",
"contexts": [
{ "name": "category", "type": "category" }
]
}
}
}
}
自动完成查询:
GET /products/_search
{
"suggest": {
"product_suggest": {
"prefix": "wire",
"completion": {
"field": "name_suggest",
"size": 5,
"fuzzy": { "fuzziness": 1 },
"contexts": {
"category": ["electronics"]
}
}
}
}
}
边缘N-gram分析器 - 替代方法:
PUT /products
{
"settings": {
"analysis": {
"filter": {
"edge_ngram_filter": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 20
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "edge_ngram_filter"]
},
"autocomplete_search": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase"]
}
}
}
},
"mappings": {
"properties": {
"name": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
同义词和分析器
同义词配置:
PUT /products
{
"settings": {
"analysis": {
"filter": {
"synonym_filter": {
"type": "synonym",
"synonyms": [
"laptop, notebook, portable computer",
"phone, mobile, cellphone, smartphone",
"tv, television, telly"
]
},
"synonym_graph_filter": {
"type": "synonym_graph",
"synonyms_path": "synonyms.txt"
}
},
"analyzer": {
"synonym_analyzer": {
"tokenizer": "standard",
"filter": ["lowercase", "synonym_filter"]
}
}
}
}
}
带多个过滤器的自定义分析器:
PUT /products
{
"settings": {
"analysis": {
"char_filter": {
"html_strip": { "type": "html_strip" }
},
"filter": {
"english_stop": { "type": "stop", "stopwords": "_english_" },
"english_stemmer": { "type": "stemmer", "language": "english" }
},
"analyzer": {
"english_analyzer": {
"type": "custom",
"char_filter": ["html_strip"],
"tokenizer": "standard",
"filter": ["lowercase", "english_stop", "english_stemmer"]
}
}
}
}
}
Elasticsearch模式
连接管理:
/**
 * Singleton holder for the Elasticsearch client.
 *
 * The client is created lazily on the first call to `getInstance()` and the
 * same object is handed back on every later call. Connection settings are
 * read from the ES_URL and ES_API_KEY environment variables.
 */
class ElasticsearchClient {
  static instance = null;

  /** Return the shared client, constructing it on first use. */
  static getInstance() {
    if (this.instance) {
      return this.instance;
    }

    const options = {
      node: process.env.ES_URL,
      auth: {
        apiKey: process.env.ES_API_KEY,
      },
      maxRetries: 3,
      requestTimeout: 30000,
    };
    this.instance = new Client(options);
    return this.instance;
  }
}
索引模板 - 时间序列索引的一致映射:
PUT /_index_template/logs_template
{
"index_patterns": ["logs-*"],
"template": {
"settings": {
"number_of_shards": 1,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"@timestamp": { "type": "date" },
"message": { "type": "text" },
"level": { "type": "keyword" }
}
}
}
}
重新索引模式 - 模式迁移:
// Copy products_v1 into products_v2, lower-casing "category" on the way.
// The null check keeps documents without a category from aborting the
// reindex with a script exception.
POST /_reindex
{
  "source": { "index": "products_v1" },
  "dest": { "index": "products_v2" },
  "script": {
    "lang": "painless",
    "source": "if (ctx._source.category != null) { ctx._source.category = ctx._source.category.toLowerCase(); }"
  }
}
搜索UI模式
防抖搜索输入:
import { useState, useEffect } from "react";
function SearchBar({ onSearch }) {
const [query, setQuery] = useState("");
useEffect(() => {
const timer = setTimeout(() => {
if (query.length >= 2) {
onSearch(query);
}
}, 300);
return () => clearTimeout(timer);
}, [query, onSearch]);
return (
<input
type="text"
value={query}
onChange={(e) => setQuery(e.target.value)}
placeholder="Search..."
/>
);
}
分面搜索组件:
function FacetedSearch({ aggregations, selectedFilters, onFilterChange }) {
return (
<div className="facets">
<div className="facet-group">
<h3>Category</h3>
{aggregations.categories.buckets.map((bucket) => (
<label key={bucket.key}>
<input
type="checkbox"
checked={selectedFilters.category?.includes(bucket.key)}
onChange={() => onFilterChange("category", bucket.key)}
/>
{bucket.key} ({bucket.doc_count})
</label>
))}
</div>
<div className="facet-group">
<h3>Price Range</h3>
{aggregations.price_ranges.buckets.map((bucket) => (
<label key={bucket.key}>
<input
type="radio"
name="price_range"
checked={selectedFilters.priceRange === bucket.key}
onChange={() => onFilterChange("priceRange", bucket.key)}
/>
{bucket.key} ({bucket.doc_count})
</label>
))}
</div>
</div>
);
}
带高亮的搜索结果:
function SearchResult({ hit }) {
const getHighlightedText = (text, highlights) => {
if (!highlights) return text;
return { __html: highlights.join("...") };
};
return (
<div className="search-result">
<h3
dangerouslySetInnerHTML={getHighlightedText(
hit.name,
hit.highlight?.name,
)}
/>
<p
dangerouslySetInnerHTML={getHighlightedText(
hit.description,
hit.highlight?.description,
)}
/>
<span className="score">Score: {hit._score.toFixed(2)}</span>
</div>
);
}
相关性调整策略
测试相关性:
/**
 * Small harness for checking that a query returns the expected documents
 * near the top of the result list.
 *
 * `this.search(query)` is not defined here; it must be supplied by a subclass
 * or assigned on the instance, and must resolve to an object whose `hits`
 * property is an array of hits carrying `_id`.
 */
class RelevanceTest {
  /**
   * Run `query` and measure precision of the first `topK` results.
   *
   * @param {string} query
   * @param {string[]} expectedTopResults Document ids that should rank on top.
   * @param {number} [topK=3] How many leading hits to evaluate.
   * @returns {Promise<{precision: number, topIds: string[]}>} precision is the
   *   fraction of evaluated hits whose id is expected; 0 when there are no hits.
   */
  async testQuery(query, expectedTopResults, topK = 3) {
    const results = await this.search(query);
    const topIds = (results.hits ?? []).slice(0, topK).map((h) => h._id);
    console.log(`Query: "${query}"`);
    console.log(`Expected: ${expectedTopResults.join(", ")}`);
    console.log(`Actual: ${topIds.join(", ")}`);

    const expected = new Set(expectedTopResults);
    const matched = topIds.filter((id) => expected.has(id)).length;
    // Avoid 0 / 0 = NaN when the query returns nothing.
    const precision = topIds.length === 0 ? 0 : matched / topIds.length;
    return { precision, topIds };
  }
}

// Sample test cases
const tests = [
  { query: "wireless headphones", expected: ["prod-123", "prod-456"] },
  { query: "bluetooth speaker", expected: ["prod-789", "prod-012"] },
];
多字段评分策略:
GET /products/_search
{
"query": {
"multi_match": {
"query": "wireless headphones",
"fields": [
"exact_name^10",
"name^5",
"brand^3",
"description^2",
"tags"
],
"type": "cross_fields",
"operator": "and"
}
}
}
新近度提升模式:
GET /articles/_search
{
"query": {
"function_score": {
"query": { "match": { "content": "elasticsearch" } },
"functions": [
{
"exp": {
"published_at": {
"origin": "now",
"scale": "7d",
"offset": "1d",
"decay": 0.5
}
}
}
]
}
}
}
流行度 + 相关性组合:
// Popularity + relevance: the text score of the "name" match is multiplied
// by the sum of log1p(0.1 * sales_count) and 2 * rating.
// "missing" gives documents without sales_count / rating a value of 0 rather
// than failing the request. NOTE(review): a document with neither signal
// ends up with a function score of 0 and therefore a final score of 0 -
// confirm that is the intended ranking for new products.
GET /products/_search
{
  "query": {
    "function_score": {
      "query": { "match": { "name": "laptop" } },
      "functions": [
        {
          "field_value_factor": {
            "field": "sales_count",
            "modifier": "log1p",
            "factor": 0.1,
            "missing": 0
          }
        },
        {
          "field_value_factor": {
            "field": "rating",
            "modifier": "none",
            "factor": 2,
            "missing": 0
          }
        }
      ],
      "score_mode": "sum",
      "boost_mode": "multiply"
    }
  }
}
最佳实践
索引
- 对大块数据导入使用批量操作
- 实现索引别名以实现零停机重新索引
- 根据数据大小选择适当的分片数量
- 在生产中使用显式映射而非动态映射
查询性能
- 对精确匹配使用 `filter` 上下文(可缓存,更快)
- 仅当评分重要时使用 `must` 上下文
- 限制结果大小并使用分页
- 避免查询中的前导通配符
相关性
- 用代表性查询测试相关性
- 使用字段提升以优先重要字段
- 为业务逻辑(如流行度、新近度)实现function_score
- 考虑对OR式查询使用 `dis_max`
自动完成
- 对简单前缀匹配使用完成建议器
- 对更灵活的匹配使用边缘N-gram
- 在客户端实现防抖(200-300毫秒)
- 返回带高亮的建议
模式设计
- 对精确匹配和聚合使用 `keyword` 类型
- 对全文搜索使用 `text` 类型
- 考虑对两种用例使用多字段
- 谨慎使用嵌套对象(影响性能)
示例
完整搜索实现(Node.js)
const { Client } = require("@elastic/elasticsearch");
/**
 * Product search backed by Elasticsearch: full-text search with filters,
 * facets and highlighting, plus completion-suggester autocomplete.
 *
 * NOTE(review): responses are read as the response body itself
 * (`response.hits`, `response.suggest`); confirm this matches the installed
 * @elastic/elasticsearch client version.
 */
class SearchService {
  /**
   * @param {string} [node="http://localhost:9200"] Elasticsearch node URL.
   */
  constructor(node = "http://localhost:9200") {
    this.client = new Client({ node });
  }

  /**
   * Search the "products" index.
   *
   * @param {string} query Free-text query; when empty, all documents match.
   * @param {{category?: string, priceMin?: number, priceMax?: number}} [filters]
   * @param {number} [page=1] 1-based page number; values below 1 mean page 1.
   * @param {number} [pageSize=20]
   * @returns {Promise<{hits: object[], total: number, aggregations: object}>}
   */
  async search(query, filters = {}, page = 1, pageSize = 20) {
    const activeFilters = filters ?? {};
    // A bound of 0 is a real bound, so test for presence rather than truthiness.
    const isSet = (value) =>
      value !== undefined && value !== null && value !== "";

    const must = [];
    const filter = [];

    if (query) {
      must.push({
        multi_match: {
          query,
          fields: ["name^3", "description", "tags^2"],
          type: "best_fields",
          fuzziness: "AUTO",
        },
      });
    }

    if (activeFilters.category) {
      filter.push({ term: { category: activeFilters.category } });
    }

    const hasMin = isSet(activeFilters.priceMin);
    const hasMax = isSet(activeFilters.priceMax);
    if (hasMin || hasMax) {
      const price = {};
      if (hasMin) price.gte = activeFilters.priceMin;
      if (hasMax) price.lte = activeFilters.priceMax;
      filter.push({ range: { price } });
    }

    const response = await this.client.search({
      index: "products",
      body: {
        // Clamp so page <= 0 cannot produce a negative offset.
        from: Math.max(page - 1, 0) * pageSize,
        size: pageSize,
        query: {
          bool: {
            must: must.length ? must : [{ match_all: {} }],
            filter,
          },
        },
        aggs: {
          categories: { terms: { field: "category", size: 20 } },
          price_stats: { stats: { field: "price" } },
        },
        highlight: {
          fields: {
            name: {},
            description: { fragment_size: 150 },
          },
        },
      },
    });

    const total = response.hits.total;
    return {
      hits: response.hits.hits.map((hit) => ({
        ...hit._source,
        _score: hit._score,
        highlight: hit.highlight,
      })),
      // Accept both the object form ({ value }) and a plain number.
      total: typeof total === "number" ? total : (total?.value ?? 0),
      aggregations: response.aggregations,
    };
  }

  /**
   * Suggest completions for `prefix` from the "name_suggest" completion field.
   *
   * @param {string} prefix Text typed so far.
   * @param {number} [limit=5] Maximum number of suggestions.
   * @param {object} [contexts] Optional completion contexts, e.g.
   *   `{ category: ["electronics"] }`; needed when the completion field is
   *   mapped with contexts.
   * @returns {Promise<{text: string, score: number}[]>} Empty when the
   *   response carries no suggestions.
   */
  async autocomplete(prefix, limit = 5, contexts = undefined) {
    const completion = {
      field: "name_suggest",
      size: limit,
      fuzzy: { fuzziness: 1 },
    };
    if (contexts) {
      completion.contexts = contexts;
    }

    const response = await this.client.search({
      index: "products",
      body: {
        suggest: {
          suggestions: { prefix, completion },
        },
      },
    });

    const [entry] = response.suggest?.suggestions ?? [];
    return (entry?.options ?? []).map((opt) => ({
      text: opt.text,
      score: opt._score,
    }));
  }
}
Python实现
from elasticsearch import Elasticsearch, helpers
from typing import Dict, List, Optional
class SearchService:
    """Thin wrapper around an Elasticsearch client for indexing and search."""

    # Immutable default so no list object is shared between calls.
    _DEFAULT_HOSTS = ('localhost:9200',)

    def __init__(self, hosts: Optional[List[str]] = None):
        """Connect to ``hosts``, or to ``localhost:9200`` when none are given."""
        if hosts is None:
            hosts = list(self._DEFAULT_HOSTS)
        self.es = Elasticsearch(hosts)

    def bulk_index(self, index: str, documents: List[Dict]) -> None:
        """Index ``documents`` into ``index`` with one bulk request stream.

        Each document's ``id`` key, when present, is used as the document
        ``_id``. Documents without an ``id`` are sent without ``_id`` so that
        Elasticsearch assigns one instead of receiving a null id.
        """
        actions = []
        for doc in documents:
            action = {'_index': index, '_source': doc}
            doc_id = doc.get('id')
            if doc_id is not None:
                action['_id'] = doc_id
            actions.append(action)
        helpers.bulk(self.es, actions)

    def search(
        self,
        index: str,
        query: str,
        filters: Optional[Dict] = None,
        page: int = 1,
        size: int = 20
    ) -> Dict:
        """Run a paginated full-text search with category and price facets.

        Args:
            index: Index to search.
            query: Free-text query; when empty, all documents match.
            filters: Optional ``category`` / ``price_min`` / ``price_max``.
            page: 1-based page number; values below 1 are treated as page 1.
            size: Number of hits per page.

        Returns:
            Whatever ``self.es.search`` returns for the built request body.
        """
        if query:
            must = [{
                'multi_match': {
                    'query': query,
                    'fields': ['name^3', 'description'],
                    'fuzziness': 'AUTO'
                }
            }]
        else:
            must = [{'match_all': {}}]

        body = {
            # Clamp so page <= 0 cannot produce a negative offset.
            'from': max(page - 1, 0) * size,
            'size': size,
            'query': {
                'bool': {
                    'must': must,
                    'filter': self._build_filters(filters or {})
                }
            },
            'aggs': {
                'categories': {'terms': {'field': 'category'}},
                'price_ranges': {
                    'range': {
                        'field': 'price',
                        'ranges': [
                            {'to': 50},
                            {'from': 50, 'to': 100},
                            {'from': 100}
                        ]
                    }
                }
            }
        }
        return self.es.search(index=index, body=body)

    def _build_filters(self, filters: Dict) -> List[Dict]:
        """Translate the ``filters`` dict into bool-query filter clauses.

        Keys whose value is ``None`` are treated as absent; a price bound of
        ``0`` is kept.
        """
        result = []

        category = filters.get('category')
        if category is not None:
            result.append({'term': {'category': category}})

        price_range = {}
        price_min = filters.get('price_min')
        if price_min is not None:
            price_range['gte'] = price_min
        price_max = filters.get('price_max')
        if price_max is not None:
            price_range['lte'] = price_max
        if price_range:
            result.append({'range': {'price': price_range}})

        return result