@@ -99,7 +99,7 @@ def encode(html)
9999 end
100100
101101 def encode_and_link_urls ( html , accounts = nil , options = { } )
102- entities = Extractor . extract_entities_with_indices ( html , extract_url_without_protocol : false )
102+ entities = utf8_friendly_extractor ( html , extract_url_without_protocol : false )
103103
104104 if accounts . is_a? ( Hash )
105105 options = accounts
@@ -199,6 +199,43 @@ def rewrite(text, entities)
199199 result . flatten . join
200200 end
201201
# Extracts entities from +text+ twice and merges the results: once from a copy
# in which every character with a code point above 0xFF has been
# percent-escaped with CGI.escape, and once from +text+ as-is.
#
# Entities found in the escaped copy have their :indices translated back to
# character offsets in +text+, and their value (:url, :hashtag, :screen_name
# or :cashtag) is re-read from +text+ so it holds the unescaped characters.
# Presumably the escaping lets the extractor recognise entities that contain
# non-Latin characters -- confirm against Extractor's patterns.
#
# @param text [String] the text to scan
# @param options [Hash] forwarded unchanged to
#   Extractor.extract_entities_with_indices
# @return the result of Extractor.remove_overlapping_entities applied to the
#   remapped entities followed by the plain ones
def utf8_friendly_extractor(text, options = {})
  # Offset in the escaped string => character offset in +text+. Only offsets
  # that sit on a character boundary of +text+ are present.
  escaped_to_original = { 0 => 0 }
  escaped_length = 0

  escaped = text.each_char.with_index(1).map do |char, next_index|
    output = char.ord > 0xFF ? CGI.escape(char) : char
    escaped_length += output.length
    escaped_to_original[escaped_length] = next_index
    output
  end.join

  # NOTE: mentions in the @user/list-name form (list_slug) have not been
  # verified against this remapping and need an additional check.
  special = Extractor.extract_entities_with_indices(escaped, options).map do |extract|
    # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
    key = (extract.keys & [:url, :hashtag, :screen_name, :cashtag]).first

    first = escaped_to_original[extract[:indices].first]
    last = escaped_to_original[extract[:indices].last]

    # A boundary inside an escape sequence has no counterpart in +text+;
    # drop the entity rather than fail on a nil offset.
    next unless first && last

    # Skip the leading #, @ or $ so the value holds only the name itself.
    has_prefix_char = [:hashtag, :screen_name, :cashtag].include?(key)
    value_start = has_prefix_char ? first + 1 : first

    extract.merge(
      :indices => [first, last],
      key => text[value_start...last]
    )
  end.compact

  standard = Extractor.extract_entities_with_indices(text, options)

  Extractor.remove_overlapping_entities(special + standard)
end
238+
202239 def link_to_url ( entity , options = { } )
203240 url = Addressable ::URI . parse ( entity [ :url ] )
204241 html_attrs = { target : '_blank' , rel : 'nofollow noopener' }
0 commit comments