Skip to content

Commit 6a5e3da

Browse files
JMendykGargron
authored and committed
Allow most kinds of characters in URL query (fixes mastodon#8408) (mastodon#8447)
* Allow unicode characters in URL query strings. Fixes mastodon#8408 * Alternative approach to unicode support in URLs. Adds a PoC/idea to approach this problem.
1 parent 5092d17 commit 6a5e3da

2 files changed

Lines changed: 67 additions & 4 deletions

File tree

app/lib/formatter.rb

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def encode(html)
9999
end
100100

101101
def encode_and_link_urls(html, accounts = nil, options = {})
102-
entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false)
102+
entities = utf8_friendly_extractor(html, extract_url_without_protocol: false)
103103

104104
if accounts.is_a?(Hash)
105105
options = accounts
@@ -199,6 +199,43 @@ def rewrite(text, entities)
199199
result.flatten.join
200200
end
201201

202+
# Extracts entities from +text+ while tolerating characters above U+00FF
# inside them (for example a raw "✓" in a URL query string).
#
# The text is scanned twice: once with every character above U+00FF
# percent-encoded through CGI.escape, and once unmodified. Entities found in
# the escaped copy have their indices translated back to character offsets in
# +text+ and their value re-read from +text+, so callers never see the
# escaped form. Both result lists are then handed to
# Extractor.remove_overlapping_entities.
#
# text    - String to scan.
# options - Hash forwarded unchanged to Extractor.extract_entities_with_indices.
#
# Returns the result of Extractor.remove_overlapping_entities for the
# combined entity list.
def utf8_friendly_extractor(text, options = {})
  # Offset in the escaped string => character offset in +text+. Only offsets
  # that sit on an original character boundary are recorded, so an offset
  # that lands inside a percent-escape looks up to nil.
  escaped_to_original = { 0 => 0 }
  escaped_length = 0

  escaped = text.each_char.with_index(1).map do |char, next_offset|
    output = char.ord > 0xFF ? CGI.escape(char) : char
    escaped_length += output.length
    escaped_to_original[escaped_length] = next_offset
    output
  end.join

  # NOTE: list_slug (the @user/list-name mention format) could not be
  # obtained here, so mentions of that form require an additional check.
  special = Extractor.extract_entities_with_indices(escaped, options).map do |extract|
    # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
    key = (extract.keys & %i[url hashtag screen_name cashtag]).first

    first = escaped_to_original[extract[:indices].first]
    last  = escaped_to_original[extract[:indices].last]

    # A boundary inside a percent-escape has no counterpart in +text+; drop
    # the entity instead of failing and rely on the scan of the raw text.
    next if first.nil? || last.nil?

    # account for the leading #, @ or $, which is not part of the value
    value_start = %i[hashtag screen_name cashtag].include?(key) ? first + 1 : first

    extract.merge(
      :indices => [first, last],
      key => text[value_start...last]
    )
  end.compact

  standard = Extractor.extract_entities_with_indices(text, options)

  Extractor.remove_overlapping_entities(special + standard)
end
238+
202239
def link_to_url(entity, options = {})
203240
url = Addressable::URI.parse(entity[:url])
204241
html_attrs = { target: '_blank', rel: 'nofollow noopener' }

spec/lib/formatter_spec.rb

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,36 @@
7474
end
7575

7676
context 'given a URL with a query string' do
77-
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
77+
context 'with escaped unicode character' do
78+
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
7879

79-
it 'matches the full URL' do
80-
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"'
80+
it 'matches the full URL' do
81+
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"'
82+
end
83+
end
84+
85+
context 'with unicode character' do
86+
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' }
87+
88+
it 'matches the full URL' do
89+
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&amp;q=autolink"'
90+
end
91+
end
92+
93+
context 'with unicode character at the end' do
94+
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' }
95+
96+
it 'matches the full URL' do
97+
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"'
98+
end
99+
end
100+
101+
context 'with escaped and not escaped unicode characters' do
102+
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' }
103+
104+
it 'preserves escaped unicode characters' do
105+
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;utf81=✓&amp;q=autolink"'
106+
end
81107
end
82108
end
83109

0 commit comments

Comments
 (0)