Skip to content

Commit 5ebfd66

Browse files
committed
feat: 优化全局搜索功能 - 提升搜索准确度与排序质量
1. SQL搜索添加相关性排序:CASE WHEN标题匹配THEN 2 ELSE 1排序,标题匹配结果优先展示
2. TF-IDF算法修正:COUNT改为SUM计算总词频;每个词独立计算IDF;添加查询词覆盖率加成
3. 分词过滤优化:过滤单字符标点/特殊字符避免匹配大量无关文档;原始关键词加入搜索词列表
4. 批量加载优化:将逐条DB查询改为批量IN查询(Document/Blog/Book/Member),大幅减少DB往返
5. 排序算法优化:冒泡排序改为sort.Slice,时间复杂度从O(n²)降到O(n log n)
6. 标题加权方式重构:移除10次标题重复拼接,改为搜索结果层1.5x boost系数,避免索引数据污染
7. 精确匹配boost(5x):文档内容包含原始搜索关键词时大幅提分,解决专有名词排名靠后问题
8. 词长权重:长词(更具体)的TF-IDF贡献按log2(1+len)加权,专有名词匹配权重高于短词
9. 对数TF替代原始TF:公式从wordCount/totalWordsInDoc改为1+log(wordCount+1),解决空/短文档排名异常高的问题
10. 分页时机修复:标题boost和精确匹配boost在分页截取之前完成,确保高相关性文档不被遗漏
11. 总文档数查询优化:CONCAT+DISTINCT改为子查询,可利用联合索引提升性能
12. 新增mindoc reindex CLI命令:支持全量重建倒排索引,清空旧数据后按批次重建
1 parent 0e8476b commit 5ebfd66

File tree

7 files changed

+427
-186
lines changed

7 files changed

+427
-186
lines changed

commands/command.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,12 @@ func RegisterCommand() {
237237
} else if len(os.Args) >= 2 && os.Args[1] == "update" {
238238
Update()
239239
os.Exit(0)
240+
} else if len(os.Args) >= 2 && os.Args[1] == "reindex" {
241+
ResolveCommand(os.Args[2:])
242+
fmt.Println("开始全量重建倒排索引...")
243+
models.RebuildAllIndexes()
244+
fmt.Println("倒排索引重建完成")
245+
os.Exit(0)
240246
}
241247

242248
}

controllers/SearchController.go

Lines changed: 230 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
package controllers
22

33
import (
4+
"sort"
45
"strconv"
56
"strings"
67

8+
"github.com/beego/beego/v2/client/orm"
79
"github.com/beego/beego/v2/core/logs"
810
"github.com/beego/i18n"
911
"github.com/mindoc-org/mindoc/conf"
@@ -62,16 +64,116 @@ func PerformSearchV2Raw(keyword string, pageIndex, pageSize int, memberId int) (
6264
words = []string{keyword}
6365
}
6466

67+
// 将原始关键词(小写)加入搜索词列表,确保能匹配索引中存储的完整词条
68+
lowerKeyword := strings.ToLower(strings.TrimSpace(keyword))
69+
if lowerKeyword != "" {
70+
found := false
71+
for _, w := range words {
72+
if w == lowerKeyword {
73+
found = true
74+
break
75+
}
76+
}
77+
if !found {
78+
words = append(words, lowerKeyword)
79+
}
80+
}
81+
6582
// 使用倒排索引模型进行搜索
6683
index := models.NewContentReverseIndex()
67-
results, totalCount, err := index.FindByWordsWithPagination(words, pageIndex, pageSize)
84+
allResults, err := index.FindByWords(words)
6885
if err != nil {
6986
return nil, words, 0, err
7087
}
7188

89+
// 收集需要批量查询的ID
90+
docIds := make([]int, 0)
91+
blogIds := make([]int, 0)
92+
for _, result := range allResults {
93+
if result.ContentType == 1 {
94+
docIds = append(docIds, result.ContentId)
95+
} else if result.ContentType == 2 {
96+
blogIds = append(blogIds, result.ContentId)
97+
}
98+
}
99+
100+
// 批量加载 Document
101+
docMap := make(map[int]*models.Document)
102+
if len(docIds) > 0 {
103+
var docs []*models.Document
104+
o := orm.NewOrm()
105+
_, err := o.QueryTable(models.NewDocument().TableNameWithPrefix()).Filter("document_id__in", docIds).All(&docs)
106+
if err == nil {
107+
for _, doc := range docs {
108+
docMap[doc.DocumentId] = doc
109+
}
110+
}
111+
}
112+
113+
// 批量加载 Blog
114+
blogMap := make(map[int]*models.Blog)
115+
if len(blogIds) > 0 {
116+
var blogs []*models.Blog
117+
o := orm.NewOrm()
118+
_, err := o.QueryTable(models.NewBlog().TableNameWithPrefix()).Filter("blog_id__in", blogIds).All(&blogs)
119+
if err == nil {
120+
for _, blog := range blogs {
121+
blogMap[blog.BlogId] = blog
122+
}
123+
}
124+
}
125+
126+
// 收集需要加载的 BookId 和 MemberId
127+
bookIds := make([]int, 0)
128+
memberIds := make([]int, 0)
129+
bookIdSet := make(map[int]bool)
130+
memberIdSet := make(map[int]bool)
131+
for _, doc := range docMap {
132+
if doc.BookId > 0 && !bookIdSet[doc.BookId] {
133+
bookIds = append(bookIds, doc.BookId)
134+
bookIdSet[doc.BookId] = true
135+
}
136+
if doc.MemberId > 0 && !memberIdSet[doc.MemberId] {
137+
memberIds = append(memberIds, doc.MemberId)
138+
memberIdSet[doc.MemberId] = true
139+
}
140+
}
141+
for _, blog := range blogMap {
142+
if blog.MemberId > 0 && !memberIdSet[blog.MemberId] {
143+
memberIds = append(memberIds, blog.MemberId)
144+
memberIdSet[blog.MemberId] = true
145+
}
146+
}
147+
148+
// 批量加载 Book
149+
bookMap := make(map[int]*models.Book)
150+
if len(bookIds) > 0 {
151+
var books []*models.Book
152+
o := orm.NewOrm()
153+
_, err := o.QueryTable(models.NewBook().TableNameWithPrefix()).Filter("book_id__in", bookIds).All(&books)
154+
if err == nil {
155+
for _, book := range books {
156+
bookMap[book.BookId] = book
157+
}
158+
}
159+
}
160+
161+
// 批量加载 Member
162+
memberMap := make(map[int]*models.Member)
163+
if len(memberIds) > 0 {
164+
var members []*models.Member
165+
o := orm.NewOrm()
166+
_, err := o.QueryTable(models.NewMember().TableNameWithPrefix()).Filter("member_id__in", memberIds).All(&members, "member_id", "account", "real_name")
167+
if err == nil {
168+
for _, member := range members {
169+
memberMap[member.MemberId] = member
170+
}
171+
}
172+
}
173+
72174
// 构建返回结果
73175
searchResults := make([]*SearchV2RawResult, 0)
74-
for _, result := range results {
176+
for _, result := range allResults {
75177
item := &SearchV2RawResult{
76178
ContentType: result.ContentType,
77179
ContentId: result.ContentId,
@@ -81,100 +183,150 @@ func PerformSearchV2Raw(keyword string, pageIndex, pageSize int, memberId int) (
81183

82184
// 根据内容类型获取详细信息
83185
if result.ContentType == 1 {
84-
// Document类型
85-
doc, err := models.NewDocument().Find(result.ContentId)
86-
if err == nil {
87-
// 检查文档权限
88-
book, bookErr := models.NewBook().Find(doc.BookId)
89-
if bookErr != nil {
90-
continue
186+
doc, ok := docMap[result.ContentId]
187+
if !ok {
188+
continue
189+
}
190+
book, ok := bookMap[doc.BookId]
191+
if !ok {
192+
continue
193+
}
194+
195+
item.SearchType = "document"
196+
item.DocumentId = doc.DocumentId
197+
item.DocumentName = doc.DocumentName
198+
item.BookId = doc.BookId
199+
item.BookName = book.BookName
200+
item.Identify = doc.Identify
201+
item.BookIdentify = book.Identify
202+
item.CreateTime = doc.CreateTime
203+
item.ModifyTime = doc.ModifyTime
204+
item.Content = doc.Release
205+
206+
// 获取作者信息
207+
if member, ok := memberMap[doc.MemberId]; ok {
208+
if member.RealName != "" {
209+
item.Author = member.RealName
210+
} else {
211+
item.Author = member.Account
91212
}
213+
}
92214

93-
item.SearchType = "document"
94-
item.DocumentId = doc.DocumentId
95-
item.DocumentName = doc.DocumentName
96-
item.BookId = doc.BookId
97-
item.BookName = book.BookName
98-
item.Identify = doc.Identify
99-
item.BookIdentify = book.Identify
100-
item.CreateTime = doc.CreateTime
101-
item.ModifyTime = doc.ModifyTime
102-
item.Content = doc.Release
103-
104-
// 获取作者信息
105-
if doc.MemberId > 0 {
106-
member, _ := models.NewMember().Find(doc.MemberId, "real_name", "account")
107-
if member != nil {
108-
if member.RealName != "" {
109-
item.Author = member.RealName
110-
} else {
111-
item.Author = member.Account
112-
}
113-
}
215+
// 提取描述
216+
description := doc.Release
217+
if description == "" {
218+
description = doc.Markdown
219+
}
220+
description = utils.StripTags(description)
221+
if len([]rune(description)) > 100 {
222+
description = string([]rune(description)[:100]) + "..."
223+
}
224+
item.Description = description
225+
226+
// 标题匹配加权:搜索词命中标题时提升分数
227+
titleLower := strings.ToLower(doc.DocumentName)
228+
for _, w := range words {
229+
if strings.Contains(titleLower, w) {
230+
item.Score *= 1.5
114231
}
232+
}
115233

116-
// 提取描述
117-
description := doc.Release
118-
if description == "" {
119-
description = doc.Markdown
234+
// 精确匹配加权:文档内容包含原始关键词时大幅提分
235+
if lowerKeyword != "" {
236+
contentLower := strings.ToLower(utils.StripTags(doc.Release))
237+
if contentLower == "" {
238+
contentLower = strings.ToLower(utils.StripTags(doc.Markdown))
120239
}
121-
// 去除HTML标签
122-
description = utils.StripTags(description)
123-
if len([]rune(description)) > 100 {
124-
description = string([]rune(description)[:100]) + "..."
240+
if strings.Contains(contentLower, lowerKeyword) {
241+
item.Score *= 5.0
125242
}
126-
item.Description = description
127-
128-
searchResults = append(searchResults, item)
129243
}
244+
245+
searchResults = append(searchResults, item)
130246
} else if result.ContentType == 2 {
131-
// Blog类型
132-
blog, err := models.NewBlog().Find(result.ContentId)
133-
if err == nil {
134-
item.SearchType = "blog"
135-
item.BlogId = blog.BlogId
136-
item.BlogTitle = blog.BlogTitle
137-
item.DocumentId = blog.BlogId
138-
item.DocumentName = blog.BlogTitle
139-
item.BlogIdentify = blog.BlogIdentify
140-
item.Identify = blog.BlogIdentify
141-
item.BlogExcerpt = blog.BlogExcerpt
142-
item.CreateTime = blog.Created
143-
item.ModifyTime = blog.Modified
144-
item.Content = blog.BlogRelease
145-
146-
// 获取作者信息
147-
if blog.MemberId > 0 {
148-
member, _ := models.NewMember().Find(blog.MemberId, "real_name", "account")
149-
if member != nil {
150-
if member.RealName != "" {
151-
item.Author = member.RealName
152-
} else {
153-
item.Author = member.Account
154-
}
155-
}
247+
blog, ok := blogMap[result.ContentId]
248+
if !ok {
249+
continue
250+
}
251+
252+
item.SearchType = "blog"
253+
item.BlogId = blog.BlogId
254+
item.BlogTitle = blog.BlogTitle
255+
item.DocumentId = blog.BlogId
256+
item.DocumentName = blog.BlogTitle
257+
item.BlogIdentify = blog.BlogIdentify
258+
item.Identify = blog.BlogIdentify
259+
item.BlogExcerpt = blog.BlogExcerpt
260+
item.CreateTime = blog.Created
261+
item.ModifyTime = blog.Modified
262+
item.Content = blog.BlogRelease
263+
264+
// 获取作者信息
265+
if member, ok := memberMap[blog.MemberId]; ok {
266+
if member.RealName != "" {
267+
item.Author = member.RealName
268+
} else {
269+
item.Author = member.Account
156270
}
271+
}
157272

158-
// 提取描述
159-
description := blog.BlogExcerpt
273+
// 提取描述
274+
description := blog.BlogExcerpt
275+
if description == "" {
276+
description = blog.BlogRelease
160277
if description == "" {
161-
description = blog.BlogRelease
162-
if description == "" {
163-
description = blog.BlogContent
164-
}
278+
description = blog.BlogContent
165279
}
166-
description = utils.StripTags(description)
167-
if len([]rune(description)) > 100 {
168-
description = string([]rune(description)[:100]) + "..."
280+
}
281+
description = utils.StripTags(description)
282+
if len([]rune(description)) > 100 {
283+
description = string([]rune(description)[:100]) + "..."
284+
}
285+
item.Description = description
286+
287+
// 标题匹配加权:搜索词命中标题时提升分数
288+
titleLower := strings.ToLower(blog.BlogTitle)
289+
for _, w := range words {
290+
if strings.Contains(titleLower, w) {
291+
item.Score *= 1.5
169292
}
170-
item.Description = description
293+
}
171294

172-
searchResults = append(searchResults, item)
295+
// 精确匹配加权:博客内容包含原始关键词时大幅提分
296+
if lowerKeyword != "" {
297+
contentLower := strings.ToLower(utils.StripTags(blog.BlogRelease))
298+
if contentLower == "" {
299+
contentLower = strings.ToLower(utils.StripTags(blog.BlogContent))
300+
}
301+
if strings.Contains(contentLower, lowerKeyword) {
302+
item.Score *= 5.0
303+
}
173304
}
305+
306+
searchResults = append(searchResults, item)
174307
}
175308
}
176309

177-
return searchResults, words, totalCount, nil
310+
// 按加权后的分数重新排序
311+
sort.Slice(searchResults, func(i, j int) bool {
312+
return searchResults[i].Score > searchResults[j].Score
313+
})
314+
315+
// 分页
316+
totalCount := len(searchResults)
317+
offset := (pageIndex - 1) * pageSize
318+
end := offset + pageSize
319+
if offset > totalCount {
320+
offset = totalCount
321+
}
322+
if end > totalCount {
323+
end = totalCount
324+
}
325+
if offset >= end {
326+
return nil, words, totalCount, nil
327+
}
328+
329+
return searchResults[offset:end], words, totalCount, nil
178330
}
179331

180332
// performSearchV2 执行倒排索引搜索,返回 SearchV2Result 列表

models/Blog.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,9 @@ func (b *Blog) Save(cols ...string) error {
252252
go func(blogId int, blogTitle, blogRelease, blogContent string) {
253253
content := blogRelease
254254
if content == "" {
255-
content = blogTitle + "\n" + blogContent
255+
content = blogContent
256256
}
257+
content = blogTitle + "\n" + content
257258
content = utils.StripTags(content)
258259
if err := BuildIndexForBlog(blogId, content); err != nil {
259260
logs.Error("构建Blog倒排索引失败 ->", blogId, err)

0 commit comments

Comments
 (0)