Skip to content

Commit 72cb7ce

Browse files
Gargron authored and mayaeh committed
Add more accurate hashtag search (mastodon#11579)
* Add more accurate hashtag search

  Use Elasticsearch to index hashtags with edge n-grams and score them by their usage over the last 7 days and by how recently they were last used. Only hashtags that have been reviewed and are listable can appear in searches, unless they match the query exactly.

* Fix search analyzer dropping non-ASCII characters
1 parent 791c6d2 commit 72cb7ce

10 files changed

Lines changed: 149 additions & 13 deletions

File tree

app/chewy/tags_index.rb

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# frozen_string_literal: true
2+
3+
# Chewy index definition for hashtags.
#
# Tag names are indexed twice: as a single normalized token (`name`) and as
# edge n-grams of 2 to 15 characters (`name.edge_ngram`) for prefix matching.
class TagsIndex < Chewy::Index
  settings(
    index: { refresh_interval: '15m' },
    analysis: {
      analyzer: {
        # Whole tag name as one token, normalized by the listed filters.
        content: {
          tokenizer: 'keyword',
          filter: %w(lowercase asciifolding cjk_width),
        },

        # Same normalization, applied to the edge n-grams of the tag name.
        edge_ngram: {
          tokenizer: 'edge_ngram',
          filter: %w(lowercase asciifolding cjk_width),
        },
      },

      tokenizer: {
        edge_ngram: { type: 'edge_ngram', min_gram: 2, max_gram: 15 },
      },
    }
  )

  # Only listable tags are imported; a tag that has been destroyed or is no
  # longer listable is deleted from the index instead of being updated.
  define_type ::Tag.listable, delete_if: ->(record) { record.destroyed? || !record.listable? } do
    root date_detection: false do
      field :name, type: 'text', analyzer: 'content' do
        # Indexed as n-grams, but searched with the whole-token analyzer.
        field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
      end

      field :reviewed, type: 'boolean', value: ->(record) { record.reviewed? }

      # Total of the `:accounts` values across the tag's history entries.
      field :usage, type: 'long', value: ->(record) { record.history.sum { |entry| entry[:accounts].to_i } }

      # Falls back to the creation time for tags without a recorded status time.
      field :last_status_at, type: 'date', value: ->(record) { record.last_status_at || record.created_at }
    end
  end
end

app/models/tag.rb

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# listable :boolean
1414
# reviewed_at :datetime
1515
# requested_review_at :datetime
16+
# last_status_at :datetime
17+
# last_trend_at :datetime
1618
#
1719

1820
class Tag < ApplicationRecord
@@ -33,7 +35,8 @@ class Tag < ApplicationRecord
3335
scope :unreviewed, -> { where(reviewed_at: nil) }
3436
scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) }
3537
scope :usable, -> { where(usable: [true, nil]) }
36-
scope :discoverable, -> { where(listable: [true, nil]).joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
38+
scope :listable, -> { where(listable: [true, nil]) }
39+
scope :discoverable, -> { listable.joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
3740
scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) }
3841

3942
delegate :accounts_count,
@@ -44,6 +47,8 @@ class Tag < ApplicationRecord
4447

4548
after_save :save_account_tag_stat
4649

50+
update_index('tags#tag', :self) if Chewy.enabled?
51+
4752
def account_tag_stat
4853
super || build_account_tag_stat
4954
end
@@ -121,9 +126,10 @@ def search_for(term, limit = 5, offset = 0)
121126
normalized_term = normalize(term.strip).mb_chars.downcase.to_s
122127
pattern = sanitize_sql_like(normalized_term) + '%'
123128

124-
Tag.where(arel_table[:name].lower.matches(pattern))
125-
.where(arel_table[:score].gt(0).or(arel_table[:name].lower.eq(normalized_term)))
126-
.order(Arel.sql('length(name) ASC, score DESC, name ASC'))
129+
Tag.listable
130+
.where(arel_table[:name].lower.matches(pattern))
131+
.where(arel_table[:name].lower.eq(normalized_term).or(arel_table[:reviewed_at].not_eq(nil)))
132+
.order(Arel.sql('length(name) ASC, name ASC'))
127133
.limit(limit)
128134
.offset(offset)
129135
end

app/models/trending_tags.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ def record_use!(tag, account, at_time = Time.now.utc)
1717
increment_historical_use!(tag.id, at_time)
1818
increment_unique_use!(tag.id, account.id, at_time)
1919
increment_vote!(tag, at_time)
20+
21+
tag.update(last_status_at: Time.now.utc) if tag.last_status_at.nil? || tag.last_status_at < 12.hours.ago
22+
tag.update(last_trend_at: Time.now.utc) if trending?(tag) && (tag.last_trend_at.nil? || tag.last_trend_at < 12.hours.ago)
2023
end
2124

2225
def get(limit, filtered: true)

app/services/account_search_service.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def followers_score_function
109109
field_value_factor: {
110110
field: 'followers_count',
111111
modifier: 'log2p',
112-
missing: 1,
112+
missing: 0,
113113
},
114114
}
115115
end

app/services/search_service.rb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,10 @@ def perform_statuses_search!
5757
end
5858

5959
def perform_hashtags_search!
60-
Tag.search_for(
61-
@query.gsub(/\A#/, ''),
62-
@limit,
63-
@offset
60+
TagSearchService.new.call(
61+
@query,
62+
limit: @limit,
63+
offset: @offset
6464
)
6565
end
6666

app/services/tag_search_service.rb

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# frozen_string_literal: true
2+
3+
# Looks up hashtags matching a search term, using Elasticsearch when Chewy is
# enabled and falling back to the database otherwise.
class TagSearchService < BaseService
  # @param query [String] raw search term; surrounding whitespace and a single
  #   leading "#" are removed before searching
  # @param options [Hash] `:limit` and `:offset`, each coerced with `#to_i`
  #   (a missing key therefore becomes 0)
  # @return the matching tags from Elasticsearch or from `Tag.search_for`
  def call(query, options = {})
    @query  = query.strip.sub(/\A#/, '')
    @offset = options[:offset].to_i
    @limit  = options[:limit].to_i

    if Chewy.enabled?
      from_elasticsearch
    else
      from_database
    end
  end

  private

  def from_elasticsearch
    # `compact` drops nil entries from the loaded objects — presumably hits
    # whose database record no longer exists (TODO confirm).
    TagsIndex.query(scored_query)
             .filter(visibility_filter)
             .limit(@limit)
             .offset(@offset)
             .objects
             .compact
  end

  # Text match on the tag name (whole token and edge n-grams), with the
  # relevance score multiplied by a usage factor and a recency decay.
  def scored_query
    {
      function_score: {
        query: {
          multi_match: {
            query: @query,
            fields: %w(name.edge_ngram name),
            type: 'most_fields',
            operator: 'and',
          },
        },

        functions: [
          {
            field_value_factor: {
              field: 'usage',
              modifier: 'log2p',
              missing: 0,
            },
          },

          {
            gauss: {
              last_status_at: {
                scale: '7d',
                offset: '14d',
                decay: 0.5,
              },
            },
          },
        ],

        boost_mode: 'multiply',
      },
    }
  end

  # Keeps a hit when the tag has been reviewed OR its name equals the query.
  #
  # The name clause is a `match` query rather than a `term` query. `name` is
  # a text field whose indexed token is normalized by its analyzer, and a
  # `term` query compares the raw input without analysis. With `term`, a query
  # like "Ruby" would never equal the stored token and unreviewed exact
  # matches would be dropped. `match` runs the query through the same
  # analyzer first.
  def visibility_filter
    {
      bool: {
        should: [
          {
            term: {
              reviewed: {
                value: true,
              },
            },
          },

          {
            match: {
              name: {
                query: @query,
              },
            },
          },
        ],
      },
    }
  end

  def from_database
    Tag.search_for(@query, @limit, @offset)
  end
end

config/locales/simple_form.en.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ en:
142142
report: Send e-mail when a new report is submitted
143143
trending_tag: Send e-mail when an unreviewed hashtag is trending
144144
tag:
145-
listable: Allow this hashtag to appear on the profile directory
145+
listable: Allow this hashtag to appear in searches and on the profile directory
146146
trendable: Allow this hashtag to appear under trends
147147
usable: Allow toots to use this hashtag
148148
'no': 'No'
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Adds the `last_status_at` and `last_trend_at` datetime columns to `tags`.
class AddLastStatusAtToTags < ActiveRecord::Migration[5.2]
  def change
    %i(last_status_at last_trend_at).each do |column_name|
      add_column :tags, column_name, :datetime
    end
  end
end

db/schema.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#
1111
# It's strongly recommended that you check this file into your version control system.
1212

13-
ActiveRecord::Schema.define(version: 2019_08_07_135426) do
13+
ActiveRecord::Schema.define(version: 2019_08_15_225426) do
1414

1515
# These are extensions that must be enabled in order to support this database
1616
enable_extension "plpgsql"
@@ -667,6 +667,8 @@
667667
t.boolean "listable"
668668
t.datetime "reviewed_at"
669669
t.datetime "requested_review_at"
670+
t.datetime "last_status_at"
671+
t.datetime "last_trend_at"
670672
t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true
671673
end
672674

spec/models/tag_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@
136136
end
137137

138138
it 'finds the exact matching tag as the first item' do
139-
similar_tag = Fabricate(:tag, name: "matchlater", score: 1)
140-
tag = Fabricate(:tag, name: "match", score: 1)
139+
similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc)
140+
tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc)
141141

142142
results = Tag.search_for("match")
143143

0 commit comments

Comments
 (0)