Skip to content

Commit a7359eb

Browse files
Gargronhiyuki2578
authored andcommitted
Change spam check to apply to local accounts and add a threshold (mastodon#11806)
Instead of detecting spam on first duplicate message, add a threshold of 5 such messages to reduce false positives
1 parent 0a0d500 commit a7359eb

4 files changed

Lines changed: 66 additions & 29 deletions

File tree

app/lib/activitypub/activity/create.rb

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -408,15 +408,7 @@ def addresses_local_accounts?
408408
end
409409

410410
def check_for_spam
411-
spam_check = SpamCheck.new(@status)
412-
413-
return if spam_check.skip?
414-
415-
if spam_check.spam?
416-
spam_check.flag!
417-
else
418-
spam_check.remember!
419-
end
411+
SpamCheck.perform(@status)
420412
end
421413

422414
def forward_for_reply

app/lib/spam_check.rb

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,25 @@ class SpamCheck
44
include Redisable
55
include ActionView::Helpers::TextHelper
66

7+
# Threshold over which two Nilsimsa values are considered
8+
# to refer to the same text
79
NILSIMSA_COMPARE_THRESHOLD = 95
8-
NILSIMSA_MIN_SIZE = 10
9-
EXPIRE_SET_AFTER = 1.week.seconds
10+
11+
# Nilsimsa doesn't work well on small inputs, so below
12+
# this size, we check only for exact matches with MD5
13+
NILSIMSA_MIN_SIZE = 10
14+
15+
# How long to keep the trail of digests between updates,
16+
# there is no reason to store it forever
17+
EXPIRE_SET_AFTER = 1.week.seconds
18+
19+
# How many digests to keep in an account's trail. If it's
20+
# too small, spam could rotate around different message templates
21+
MAX_TRAIL_SIZE = 10
22+
23+
# How many detected duplicates to allow through before
24+
# considering the message as spam
25+
THRESHOLD = 5
1026

1127
def initialize(status)
1228
@account = status.account
@@ -21,9 +37,9 @@ def spam?
2137
if insufficient_data?
2238
false
2339
elsif nilsimsa?
24-
any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
40+
digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
2541
else
26-
any_other_digest?('md5') { |_, other_digest| other_digest == digest }
42+
digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
2743
end
2844
end
2945

@@ -38,7 +54,7 @@ def remember!
3854
# get the correct status ID back, we have to save it in the string value
3955

4056
redis.zadd(redis_key, @status.id, digest_with_algorithm)
41-
redis.zremrangebyrank(redis_key, '0', '-10')
57+
redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
4258
redis.expire(redis_key, EXPIRE_SET_AFTER)
4359
end
4460

@@ -78,6 +94,20 @@ def digest_with_algorithm
7894
end
7995
end
8096

97+
class << self
98+
def perform(status)
99+
spam_check = new(status)
100+
101+
return if spam_check.skip?
102+
103+
if spam_check.spam?
104+
spam_check.flag!
105+
else
106+
spam_check.remember!
107+
end
108+
end
109+
end
110+
81111
private
82112

83113
def disabled?
@@ -149,14 +179,14 @@ def other_digests
149179
redis.zrange(redis_key, 0, -1)
150180
end
151181

152-
def any_other_digest?(filter_algorithm)
153-
other_digests.any? do |record|
182+
def digests_over_threshold?(filter_algorithm)
183+
other_digests.select do |record|
154184
algorithm, other_digest, status_id = record.split(':')
155185

156186
next unless algorithm == filter_algorithm
157187

158188
yield algorithm, other_digest, status_id
159-
end
189+
end.size >= THRESHOLD
160190
end
161191

162192
def matching_status_ids

app/services/process_mentions_service.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def call(status)
3333
end
3434

3535
status.save!
36+
check_for_spam(status)
3637

3738
mentions.each { |mention| create_notification(mention) }
3839
end
@@ -61,4 +62,8 @@ def activitypub_json
6162
def resolve_account_service
6263
ResolveAccountService.new
6364
end
65+
66+
def check_for_spam(status)
67+
SpamCheck.perform(status)
68+
end
6469
end

spec/lib/spam_check_spec.rb

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -86,23 +86,33 @@ def status_with_html(text, options = {})
8686
end
8787

8888
it 'returns true for duplicate statuses to the same recipient' do
89-
status1 = status_with_html('@alice Hello')
90-
described_class.new(status1).remember!
89+
described_class::THRESHOLD.times do
90+
status1 = status_with_html('@alice Hello')
91+
described_class.new(status1).remember!
92+
end
93+
9194
status2 = status_with_html('@alice Hello')
9295
expect(described_class.new(status2).spam?).to be true
9396
end
9497

9598
it 'returns true for duplicate statuses to different recipients' do
96-
status1 = status_with_html('@alice Hello')
97-
described_class.new(status1).remember!
99+
described_class::THRESHOLD.times do
100+
status1 = status_with_html('@alice Hello')
101+
described_class.new(status1).remember!
102+
end
103+
98104
status2 = status_with_html('@bob Hello')
99105
expect(described_class.new(status2).spam?).to be true
100106
end
101107

102108
it 'returns true for nearly identical statuses with random numbers' do
103109
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
104-
status1 = status_with_html('@alice ' + source_text + ' 1234')
105-
described_class.new(status1).remember!
110+
111+
described_class::THRESHOLD.times do
112+
status1 = status_with_html('@alice ' + source_text + ' 1234')
113+
described_class.new(status1).remember!
114+
end
115+
106116
status2 = status_with_html('@bob ' + source_text + ' 9568')
107117
expect(described_class.new(status2).spam?).to be true
108118
end
@@ -140,9 +150,9 @@ def status_with_html(text, options = {})
140150
let(:redis_key) { spam_check.send(:redis_key) }
141151

142152
it 'remembers' do
143-
expect do
144-
spam_check.remember!
145-
end.to change { Redis.current.exists(redis_key) }.from(false).to(true)
153+
expect(Redis.current.exists(redis_key)).to be true
154+
spam_check.remember!
155+
expect(Redis.current.exists(redis_key)).to be true
146156
end
147157
end
148158

@@ -156,9 +166,9 @@ def status_with_html(text, options = {})
156166
end
157167

158168
it 'resets' do
159-
expect do
160-
spam_check.reset!
161-
end.to change { Redis.current.exists(redis_key) }.from(true).to(false)
169+
expect(Redis.current.exists(redis_key)).to be true
170+
spam_check.reset!
171+
expect(Redis.current.exists(redis_key)).to be false
162172
end
163173
end
164174

0 commit comments

Comments
 (0)