# frozen_string_literal: true

# Heuristic check for an account repeatedly posting the same (or nearly the
# same) text.
#
# A digest of each status' normalised text is remembered in a per-account
# Redis sorted set (see #remember!). A status is considered spam when its
# digest matches a remembered digest produced by the same algorithm: MD5
# equality for short texts, Nilsimsa similarity for longer ones.
class SpamCheck
  include Redisable
  include ActionView::Helpers::TextHelper

  # Smallest Nilsimsa compare value that is treated as a match.
  NILSIMSA_COMPARE_THRESHOLD = 95

  # Text must be longer than this many characters to be hashed with Nilsimsa;
  # anything shorter falls back to MD5.
  NILSIMSA_MIN_SIZE = 10

  # Time-to-live of the per-account digest set, renewed by every #remember!.
  EXPIRE_SET_AFTER = 1.week.seconds

  # @param status the status to check; its account owns the digest history
  def initialize(status)
    @account = status.account
    @status  = status
  end

  # Whether the check can be skipped entirely for this status.
  #
  # @return [Boolean] true when the account is already silenced, has a trust
  #   level above untrusted, the status contains no unsolicited mentions, or
  #   the status answers a thread that mentions its author
  def skip?
    already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply?
  end

  # Whether the status' digest matches any remembered digest of the account.
  #
  # @return [Boolean] always false when there is no text left to hash
  def spam?
    return false if insufficient_data?

    other_digests.any? { |record| matching_record?(record) }
  end

  # Silences the author and files a report about the status.
  def flag!
    auto_silence_account!
    auto_report_status!
  end

  # Stores this status' digest in the account's digest set.
  def remember!
    # The scores in sorted sets don't actually have enough bits to hold an exact
    # value of our snowflake IDs, so we use it only for its ordering property. To
    # get the correct status ID back, we have to save it in the string value

    redis.zadd(redis_key, @status.id, digest_with_algorithm)
    # Drop everything up to the 10th-from-last rank, i.e. keep only the nine
    # highest-scored entries.
    redis.zremrangebyrank(redis_key, '0', '-10')
    redis.expire(redis_key, EXPIRE_SET_AFTER)
  end

  # Forgets every digest remembered for the account.
  def reset!
    redis.del(redis_key)
  end

  # The text that gets hashed: spoiler text plus status text, with mentions
  # removed, HTML tags stripped (non-local statuses only), NFKC-normalised,
  # downcased and with whitespace collapsed.
  #
  # The result is built in a local and memoised only once complete, so an
  # exception part-way through cannot leave a half-processed value cached.
  #
  # @return [String]
  def hashable_text
    return @hashable_text if defined?(@hashable_text)

    text = remove_mentions(@status.text)
    text = strip_tags(text) unless @status.local?
    text = normalize_unicode("#{@status.spoiler_text} #{text}")

    @hashable_text = remove_whitespace(text)
  end

  # @return [Boolean] true when nothing is left to hash after normalisation
  def insufficient_data?
    hashable_text.blank?
  end

  # Hex digest of #hashable_text, Nilsimsa or MD5 depending on text length.
  #
  # @return [String]
  def digest
    @digest ||= if nilsimsa?
                  Nilsimsa.new(hashable_text).hexdigest
                else
                  Digest::MD5.hexdigest(hashable_text)
                end
  end

  # The record stored in Redis: "<algorithm>:<digest>:<status id>".
  #
  # @return [String]
  def digest_with_algorithm
    [digest_algorithm, digest, @status.id].join(':')
  end

  private

  # Removes mentions from the text: by regexp for local statuses, and by
  # unlinking anchors that point at a mentioned account for everything else.
  def remove_mentions(text)
    return text.gsub(Account::MENTION_RE, '') if @status.local?

    mention_urls = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) }
    fragment     = Nokogiri::HTML.fragment(text)

    fragment.traverse do |node|
      node.unlink if node.name == 'a' && mention_urls.include?(node['href'])
    end

    fragment.to_s
  end

  def normalize_unicode(text)
    text.unicode_normalize(:nfkc).downcase
  end

  # Collapses every run of whitespace into one space and trims both ends.
  def remove_whitespace(text)
    text.gsub(/\s+/, ' ').strip
  end

  def auto_silence_account!
    @account.silence!
  end

  # Reports the account on behalf of Account.representative. Status IDs are
  # attached only when the current status is distributable; otherwise
  # status_ids is passed as nil.
  def auto_report_status!
    status_ids = nil

    if @status.distributable?
      earlier_ids = Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id)
      status_ids  = earlier_ids + [@status.id]
    end

    ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected_and_silenced'))
  end

  def already_flagged?
    @account.silenced?
  end

  def trusted?
    @account.trust_level > Account::TRUST_LEVELS[:untrusted]
  end

  # True when every mention is silent, is between two non-local accounts, or
  # targets an account that follows the author.
  def no_unsolicited_mentions?
    @status.mentions.all? do |mention|
      mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account)
    end
  end

  # True when the status has a thread and that thread mentions the author.
  def solicited_reply?
    thread = @status.thread

    !thread.nil? && thread.mentions.where(account: @account).exists?
  end

  # Similarity of two hex-encoded digests, compared over their first 32 bytes.
  # Each XORed byte pair is looked up in Nilsimsa::POPC (presumably the gem's
  # bit-count table -- confirm against the nilsimsa gem) and the sum is
  # subtracted from 128.
  #
  # @param first [String] hex digest
  # @param second [String] hex digest
  # @return [Integer] -128 <= Nilsimsa Compare Value <= 128
  def nilsimsa_compare_value(first, second)
    first_raw  = [first].pack('H*')
    second_raw = [second].pack('H*')

    differing_bits = (0..31).sum do |i|
      Nilsimsa::POPC[255 & (first_raw[i].ord ^ second_raw[i].ord)].ord
    end

    128 - differing_bits
  end

  def nilsimsa?
    hashable_text.size > NILSIMSA_MIN_SIZE
  end

  # Tag identifying which algorithm #digest uses for this status.
  def digest_algorithm
    nilsimsa? ? 'nilsimsa' : 'md5'
  end

  # All records currently remembered for the account, lowest score first.
  def other_digests
    redis.zrange(redis_key, 0, -1)
  end

  # Single source of truth for "does this remembered record match the
  # current status": the record must come from the same algorithm, and then
  # either meet the Nilsimsa threshold or equal the MD5 digest exactly.
  def matching_record?(record)
    algorithm, other_digest, = record.split(':')

    return false unless algorithm == digest_algorithm

    if nilsimsa?
      nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD
    else
      other_digest == digest
    end
  end

  # Status IDs (as stored, i.e. strings) of every remembered record that
  # matches the current status.
  def matching_status_ids
    other_digests.select { |record| matching_record?(record) }.map { |record| record.split(':')[2] }.compact
  end

  def redis_key
    @redis_key ||= "spam_check:#{@account.id}"
  end
end
0 commit comments