Skip to content

Commit 8dcdb22

Browse files
JMendykhiyuki2578
authored and committed
Allow most kinds of characters in URL query (fixes mastodon#8408) (mastodon#8447)
* Allow unicode characters in URL query strings Fixes mastodon#8408 * Alternative approach to unicode support in urls Adds PoC/idea to approach this problem.
1 parent b7382b0 commit 8dcdb22

2 files changed

Lines changed: 67 additions & 4 deletions

File tree

app/lib/formatter.rb

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def encode(html)
100100
end
101101

102102
def encode_and_link_urls(html, accounts = nil, options = {})
103-
entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false)
103+
entities = utf8_friendly_extractor(html, extract_url_without_protocol: false)
104104

105105
if accounts.is_a?(Hash)
106106
options = accounts
@@ -204,6 +204,43 @@ def rewrite(text, entities)
204204
result.flatten.join
205205
end
206206

207+
# Extracts entities (URLs, hashtags, mentions, cashtags) from +text+, also
# recognising entities that only match once raw characters above U+00FF
# have been percent-escaped (e.g. a URL query containing a literal check mark).
#
# The text is scanned twice: verbatim, and as a copy in which every
# character above U+00FF is replaced by its CGI.escape form. Entities found
# in the escaped copy are translated back to character offsets in +text+ and
# their value is re-read from +text+, so callers never see the escaped form.
#
# @param text [String] the text to scan
# @param options [Hash] passed through to Extractor.extract_entities_with_indices
# @return [Array<Hash>] whatever Extractor.remove_overlapping_entities
#   returns, with every :indices pair relative to +text+
def utf8_friendly_extractor(text, options = {})
  standard = Extractor.extract_entities_with_indices(text, options)

  # Maps an offset in the escaped string to the index of the original
  # character that starts at that offset. Offsets that fall inside an
  # escape sequence are deliberately absent.
  escaped_to_original = { 0 => 0 }
  escaped_length = 0

  escaped = text.each_char.with_index(1).map do |char, next_index|
    piece = char.ord > 0xFF ? CGI.escape(char) : char
    escaped_length += piece.length
    escaped_to_original[escaped_length] = next_index
    piece
  end.join

  # Nothing was escaped, so scanning the copy could only yield duplicates.
  return Extractor.remove_overlapping_entities(standard) if escaped_length == text.length

  # Note: list_slug could not be obtained for mentions written in the
  # @user/list-name format, so that case requires an additional check.
  special = Extractor.extract_entities_with_indices(escaped, options).map do |entity|
    first = escaped_to_original[entity[:indices].first]
    last = escaped_to_original[entity[:indices].last]

    # A boundary inside an escape sequence has no counterpart in +text+;
    # drop the entity instead of failing on a nil index.
    next unless first && last

    # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
    key = (entity.keys & [:url, :hashtag, :screen_name, :cashtag]).first

    # account for the leading #, @ or $ which is not part of the value
    value_start = [:hashtag, :screen_name, :cashtag].include?(key) ? first + 1 : first

    entity.merge(:indices => [first, last], key => text[value_start...last])
  end.compact

  # Both passes often report an entity starting at the same index (one being
  # a prefix of the other). Keep only the longest per start index so the
  # outcome does not depend on how remove_overlapping_entities orders ties
  # (Ruby's sort_by is not guaranteed to be stable).
  candidates = (special + standard)
               .group_by { |entity| entity[:indices].first }
               .values
               .map { |same_start| same_start.max_by { |entity| entity[:indices].last } }

  Extractor.remove_overlapping_entities(candidates)
end
243+
207244
def link_to_url(entity, options = {})
208245
url = Addressable::URI.parse(entity[:url])
209246
html_attrs = { target: '_blank', rel: 'nofollow noopener' }

spec/lib/formatter_spec.rb

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,36 @@
7474
end
7575

7676
context 'given a URL with a query string' do
77-
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
77+
context 'with escaped unicode character' do
78+
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
7879

79-
it 'matches the full URL' do
80-
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"'
80+
it 'matches the full URL' do
81+
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"'
82+
end
83+
end
84+
85+
context 'with unicode character' do
86+
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' }
87+
88+
it 'matches the full URL' do
89+
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&amp;q=autolink"'
90+
end
91+
end
92+
93+
context 'with unicode character at the end' do
94+
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' }
95+
96+
it 'matches the full URL' do
97+
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"'
98+
end
99+
end
100+
101+
context 'with escaped and not escaped unicode characters' do
102+
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' }
103+
104+
it 'preserves escaped unicode characters' do
105+
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;utf81=✓&amp;q=autolink"'
106+
end
81107
end
82108
end
83109

0 commit comments

Comments
 (0)