@@ -100,7 +100,7 @@ def encode(html)
100100 end
101101
102102 def encode_and_link_urls ( html , accounts = nil , options = { } )
103- entities = Extractor . extract_entities_with_indices ( html , extract_url_without_protocol : false )
103+ entities = utf8_friendly_extractor ( html , extract_url_without_protocol : false )
104104
105105 if accounts . is_a? ( Hash )
106106 options = accounts
@@ -204,6 +204,43 @@ def rewrite(text, entities)
204204 result . flatten . join
205205 end
206206
# Extracts entities (URLs, hashtags, mentions, cashtags) from +text+ twice:
# once from a copy in which every character with a code point above 0xFF has
# been percent-escaped via CGI.escape, and once from +text+ unchanged. The
# two result sets are merged through Extractor.remove_overlapping_entities.
#
# Entities found in the escaped copy have their :indices translated back to
# character positions in +text+, and their value (:url, :hashtag,
# :screen_name or :cashtag) re-read from +text+ so it is never returned in
# escaped form.
#
# @param text [String] the text to scan
# @param options [Hash] passed through to Extractor.extract_entities_with_indices
# @return [Array<Hash>] whatever Extractor.remove_overlapping_entities returns
#   for the combined entity list
def utf8_friendly_extractor(text, options = {})
  # Maps an offset in the escaped string to the index of the character in
  # +text+ that starts there. Only character boundaries (including the end of
  # the string) are present, so a lookup doubles as a boundary check.
  original_index_at = { 0 => 0 }
  escaped_length = 0

  escaped = text.each_char.with_index(1).map do |char, next_index|
    output = char.ord > 0xFF ? CGI.escape(char) : char
    escaped_length += output.length
    original_index_at[escaped_length] = next_index
    output
  end.join

  # Note: list_slug could not be obtained for mentions written in the
  # @user/list-name format, so that case requires an additional check.
  special = Extractor.extract_entities_with_indices(escaped, options).map do |extract|
    # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
    key = (extract.keys & %i[url hashtag screen_name cashtag]).first

    first_index = original_index_at[extract[:indices].first]
    last_index = original_index_at[extract[:indices].last]

    # An entity that starts or ends in the middle of an escape sequence has no
    # counterpart in the original text; skip it instead of failing on nil.
    next if first_index.nil? || last_index.nil?

    # account for the leading #, @ or $, which is not part of the value
    has_prefix_char = %i[hashtag screen_name cashtag].include?(key)
    value_start = first_index + (has_prefix_char ? 1 : 0)

    extract.merge(
      :indices => [first_index, last_index],
      key => text[value_start...last_index]
    )
  end.compact

  standard = Extractor.extract_entities_with_indices(text, options)

  Extractor.remove_overlapping_entities(special + standard)
end
243+
207244 def link_to_url ( entity , options = { } )
208245 url = Addressable ::URI . parse ( entity [ :url ] )
209246 html_attrs = { target : '_blank' , rel : 'nofollow noopener' }
0 commit comments