Skip to content

Commit 320eb3a

Browse files
committed
Use Nilsimsa to generate locality-sensitive hashes and compare using Levenshtein distance
1 parent fd23b73 commit 320eb3a

4 files changed

Lines changed: 43 additions & 2 deletions

File tree

Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ gem 'idn-ruby', require: 'idn'
5858
gem 'kaminari', '~> 1.1'
5959
gem 'link_header', '~> 0.0'
6060
gem 'mime-types', '~> 3.2', require: 'mime/types/columnar'
61+
gem 'nilsimsa', git: 'https://github.com/witgo/nilsimsa', ref: 'fd184883048b922b176939f851338d0a4971a532'
6162
gem 'nokogiri', '~> 1.10'
6263
gem 'nsa', '~> 0.2'
6364
gem 'oj', '~> 3.7'

Gemfile.lock

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@ GIT
1212
specs:
1313
http_parser.rb (0.6.1)
1414

15+
GIT
16+
remote: https://github.com/witgo/nilsimsa
17+
revision: fd184883048b922b176939f851338d0a4971a532
18+
ref: fd18488
19+
specs:
20+
nilsimsa (1.1.2)
21+
1522
GEM
1623
remote: https://rubygems.org/
1724
specs:
@@ -703,6 +710,7 @@ DEPENDENCIES
703710
microformats (~> 4.1)
704711
mime-types (~> 3.2)
705712
net-ldap (~> 0.10)
713+
nilsimsa!
706714
nokogiri (~> 1.10)
707715
nsa (~> 0.2)
708716
oj (~> 3.7)

app/lib/spam_check.rb

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ class SpamCheck
44
include Redisable
55
include ActionView::Helpers::TextHelper
66

7+
LEVENSHTEIN_THRESHOLD = 10
8+
79
def initialize(status)
810
@account = status.account
911
@status = status
@@ -14,7 +16,8 @@ def skip?
1416
end
1517

1618
def spam?
17-
!redis.zrank("spam_check:#{@account.id}", digest).nil?
19+
other_digests = redis.zrange("spam_check:#{@account.id}", '0', '-1')
20+
other_digests.any? { |other_digest| levenshtein(digest, other_digest) < LEVENSHTEIN_THRESHOLD }
1821
end
1922

2023
def flag!
@@ -40,7 +43,7 @@ def hashable_text
4043
end
4144

4245
def digest
43-
@digest ||= Digest::MD5.hexdigest(hashable_text)
46+
@digest ||= Nilsimsa.new(hashable_text).hexdigest
4447
end
4548

4649
def remove_mentions(text)
@@ -76,4 +79,25 @@ def already_flagged?
7679
def no_unsolicited_mentions?
7780
@status.mentions.all? { |mention| mention.silent? || !mention.account.local? || mention.account.following?(@account) }
7881
end
82+
83+
# Computes the Levenshtein (edit) distance between +first+ and +second+:
# the minimum number of single-element insertions, deletions and
# substitutions needed to turn one into the other.
#
# Only the previous and the current row of the dynamic-programming table
# are kept, so memory use is proportional to +second.length+ rather than
# to +first.length * second.length+.
#
# @param first [String] any object responding to +length+ and +[]+
# @param second [String] any object responding to +length+ and +[]+
# @return [Integer] the edit distance (0 when both inputs are equal)
def levenshtein(first, second)
  m = first.length
  n = second.length

  # Against an empty input the distance is simply the other one's length.
  return m if n.zero?
  return n if m.zero?

  # Row 0 of the table: turning an empty prefix of +first+ into the first
  # j elements of +second+ takes j insertions.
  previous = (0..n).to_a

  1.upto(m) do |i|
    # Column 0: turning the first i elements of +first+ into an empty
    # prefix of +second+ takes i deletions.
    current = [i]

    1.upto(n) do |j|
      current << if first[i - 1] == second[j - 1]
                   # Matching elements cost nothing; carry the diagonal over.
                   previous[j - 1]
                 else
                   # Cheapest of deletion, insertion and substitution.
                   [previous[j], current[j - 1], previous[j - 1]].min + 1
                 end
    end

    previous = current
  end

  previous[n]
end
79103
end

spec/lib/spam_check_spec.rb

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ def status_with_html(text)
3030
status2 = status_with_html('@bob Hello')
3131
expect(described_class.new(status2).spam?).to be true
3232
end
33+
34+
it 'returns true for nearly identical statuses with random numbers' do
35+
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
36+
status1 = status_with_html('@alice ' + source_text + ' 1234')
37+
described_class.new(status1).remember!
38+
status2 = status_with_html('@bob ' + source_text + ' 9568')
39+
expect(described_class.new(status2).spam?).to be true
40+
end
3341
end
3442

3543
describe '#skip?' do

0 commit comments

Comments
 (0)