@@ -4,9 +4,25 @@ class SpamCheck
44 include Redisable
55 include ActionView ::Helpers ::TextHelper
66
7+ # Similarity score at or above which two Nilsimsa digests are
8+ # considered to refer to the same text
79 NILSIMSA_COMPARE_THRESHOLD = 95
8- NILSIMSA_MIN_SIZE = 10
9- EXPIRE_SET_AFTER = 1 . week . seconds
10+
11+ # Nilsimsa doesn't work well on small inputs, so below
12+ # this size, we check only for exact matches with MD5
13+ NILSIMSA_MIN_SIZE = 10
14+
15+ # How long to keep the trail of digests between updates;
16+ # there is no reason to store it forever
17+ EXPIRE_SET_AFTER = 1 . week . seconds
18+
19+ # How many digests to keep in an account's trail. If it's
20+ # too small, spam could rotate around different message templates
21+ MAX_TRAIL_SIZE = 10
22+
23+ # How many detected duplicates to allow through before
24+ # considering the message as spam
25+ THRESHOLD = 5
1026
1127 def initialize ( status )
1228 @account = status . account
@@ -21,9 +37,9 @@ def spam?
2137 if insufficient_data?
2238 false
2339 elsif nilsimsa?
24- any_other_digest ?( 'nilsimsa' ) { |_ , other_digest | nilsimsa_compare_value ( digest , other_digest ) >= NILSIMSA_COMPARE_THRESHOLD }
40+ digests_over_threshold ?( 'nilsimsa' ) { |_ , other_digest | nilsimsa_compare_value ( digest , other_digest ) >= NILSIMSA_COMPARE_THRESHOLD }
2541 else
26- any_other_digest ?( 'md5' ) { |_ , other_digest | other_digest == digest }
42+ digests_over_threshold ?( 'md5' ) { |_ , other_digest | other_digest == digest }
2743 end
2844 end
2945
@@ -38,7 +54,7 @@ def remember!
3854 # get the correct status ID back, we have to save it in the string value
3955
4056 redis . zadd ( redis_key , @status . id , digest_with_algorithm )
41- redis . zremrangebyrank ( redis_key , '0' , '-10' )
57+ redis . zremrangebyrank ( redis_key , 0 , - ( MAX_TRAIL_SIZE + 1 ) )
4258 redis . expire ( redis_key , EXPIRE_SET_AFTER )
4359 end
4460
@@ -78,6 +94,20 @@ def digest_with_algorithm
7894 end
7995 end
8096
class << self
  # Class-level entry point: builds a check for +status+ and acts on it.
  #
  # Does nothing when the check reports +skip?+. Otherwise the status is
  # either flagged (when +spam?+ is true) or handed to +remember!+.
  #
  # @param status the status to check; passed straight to +new+
  # @return nil when skipped, otherwise whatever +flag!+ / +remember!+ returns
  def perform(status)
    checker = new(status)
    return if checker.skip?

    checker.spam? ? checker.flag! : checker.remember!
  end
end
110+
81111 private
82112
83113 def disabled?
@@ -149,14 +179,14 @@ def other_digests
149179 redis . zrange ( redis_key , 0 , -1 )
150180 end
151181
# Tells whether enough previously stored digests match the current one.
#
# Each record returned by +other_digests+ is a colon-separated string of
# the form "algorithm:digest:status_id". Records whose algorithm differs
# from +filter_algorithm+ are ignored; the remaining ones are handed to
# the block, which decides whether the stored digest counts as a match.
#
# @param filter_algorithm [String] algorithm name to restrict the search to
# @yieldparam algorithm [String] algorithm of the stored record
# @yieldparam other_digest [String] stored digest value
# @yieldparam status_id [String] status ID stored alongside the digest
# @yieldreturn [Boolean] truthy when the stored digest is a match
# @return [Boolean] true when at least THRESHOLD records match
def digests_over_threshold?(filter_algorithm)
  # `count` with a block tallies matches without building the
  # intermediate array that `select { ... }.size` would allocate.
  matches = other_digests.count do |record|
    algorithm, other_digest, status_id = record.split(':')

    algorithm == filter_algorithm && yield(algorithm, other_digest, status_id)
  end

  matches >= THRESHOLD
end
161191
162192 def matching_status_ids
0 commit comments