Skip to content

Commit 9e3822a

Browse files
Gargron and jesseplusplus
authored and committed
Add support for structured data and more OpenGraph tags to link cards (mastodon#16938)
Save preview cards under their canonical URL. Increase max redirects to follow from 2 to 3.
1 parent d3fc7ed commit 9e3822a

5 files changed

Lines changed: 260 additions & 52 deletions

File tree

app/lib/link_details_extractor.rb

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
# frozen_string_literal: true
2+
3+
# Extracts the attributes of a link preview card from a fetched HTML page.
#
# Values are looked up, in order of preference, in the page's JSON-LD
# structured data, its OpenGraph/Twitter meta tags, and plain HTML tags.
class LinkDetailsExtractor
  include ActionView::Helpers::TagHelper

  # Read-only accessor over a JSON-LD payload found in the page.
  #
  # Every reader returns nil when the payload is missing the field, is not
  # valid JSON, or does not describe a JSON object.
  class StructuredData
    # @param data [String] raw contents of an application/ld+json script tag
    def initialize(data)
      @data = data
    end

    def headline
      json['headline']
    end

    def description
      json['description']
    end

    # @return [Object, nil] the `url` of the first image when it is an
    #   object, otherwise the first image value as-is
    def image
      obj = first_of_value(json['image'])
      obj.is_a?(Hash) ? obj['url'] : obj
    end

    def date_published
      json['datePublished']
    end

    def date_modified
      json['dateModified']
    end

    def author_name
      author['name']
    end

    def author_url
      author['url']
    end

    def publisher_name
      publisher['name']
    end

    private

    def author
      hash_or_empty(first_of_value(json['author']))
    end

    def publisher
      hash_or_empty(first_of_value(json['publisher']))
    end

    # Unwraps the first element when the value is a list.
    def first_of_value(arr)
      arr.is_a?(Array) ? arr.first : arr
    end

    # Guards against scalar values (e.g. a plain string author), on which
    # `value['name']` would be a substring lookup rather than a key lookup.
    def hash_or_empty(value)
      value.is_a?(Hash) ? value : {}
    end

    # Parsed payload, always a Hash. A list at the root is reduced to its
    # first element; unparsable or non-object payloads yield an empty Hash
    # so that callers fall back to other sources instead of raising.
    def json
      @json ||= begin
        hash_or_empty(first_of_value(Oj.load(@data)))
      rescue Oj::ParseError, EncodingError
        {}
      end
    end
  end

  # @param original_url [String] URL the page was fetched from; relative
  #   URLs found in the page are resolved against it
  # @param html [String] page body
  # @param html_charset [String, nil] charset hint passed to the encoding detector
  def initialize(original_url, html, html_charset)
    @original_url = Addressable::URI.parse(original_url)
    @html         = html
    @html_charset = html_charset
  end

  # @return [Hash] attributes for a preview card; missing values are
  #   replaced by '' (or 0 for dimensions), except :image_remote_url which
  #   may be nil
  def to_preview_card_attributes
    {
      title: title || '',
      description: description || '',
      image_remote_url: image,
      type: type,
      width: width || 0,
      height: height || 0,
      html: html || '',
      provider_name: provider_name || '',
      provider_url: provider_url || '',
      author_name: author_name || '',
      author_url: author_url || '',
      embed_url: embed_url || '',
    }
  end

  # @return [Symbol] :video when the page declares a valid player URL, :link otherwise
  def type
    player_url.present? ? :video : :link
  end

  # @return [String, nil] iframe markup embedding the player, or nil when
  #   the page declares no valid player URL
  def html
    return if player_url.blank?

    # The explicit nil is the tag content: without it, content_tag would
    # take the attribute hash as the iframe body and emit no attributes.
    content_tag(:iframe, nil, src: player_url, width: width, height: height, allowtransparency: 'true', scrolling: 'no', frameborder: '0')
  end

  def width
    opengraph_tag('twitter:player:width')
  end

  def height
    opengraph_tag('twitter:player:height')
  end

  # Blank values are skipped so that an empty headline or og:title does not
  # prevent falling back to the <title> tag.
  def title
    structured_data&.headline.presence || opengraph_tag('og:title').presence || document.at_xpath('//title')&.content
  end

  # Blank values are skipped so that an empty earlier source does not
  # prevent falling back to the next one.
  def description
    structured_data&.description.presence || opengraph_tag('og:description').presence || meta_tag('description')
  end

  def image
    valid_url_or_nil(opengraph_tag('og:image'))
  end

  # @return [String] the page's declared canonical URL when it is valid and
  #   on the same host as the original URL, otherwise the original URL
  def canonical_url
    valid_url_or_nil(opengraph_tag('og:url') || link_tag('canonical'), same_origin_only: true) || @original_url.to_s
  end

  def provider_name
    structured_data&.publisher_name || opengraph_tag('og:site_name')
  end

  def provider_url
    valid_url_or_nil(host_to_url(opengraph_tag('og:site')))
  end

  def author_name
    structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username')
  end

  def author_url
    structured_data&.author_url
  end

  def embed_url
    valid_url_or_nil(opengraph_tag('twitter:player:stream'))
  end

  private

  def player_url
    valid_url_or_nil(opengraph_tag('twitter:player'))
  end

  # Prefixes a bare host with http:// so it can be parsed as a URL.
  def host_to_url(str)
    return if str.blank?

    str.start_with?(%r{https?://}) ? str : "http://#{str}"
  end

  # Resolves +str+ against the original URL.
  #
  # @return [String, nil] the absolute URL, or nil when +str+ is blank,
  #   unparsable, has no host, is not http(s), or (with same_origin_only)
  #   points to a different host than the original URL
  def valid_url_or_nil(str, same_origin_only: false)
    return if str.blank?

    url = @original_url + Addressable::URI.parse(str)

    return if url.host.blank? || !%w(http https).include?(url.scheme) || (same_origin_only && url.host != @original_url.host)

    url.to_s
  rescue Addressable::URI::InvalidURIError
    nil
  end

  # href of the first <link> element with the given rel.
  def link_tag(name)
    document.at_xpath("//link[@rel=\"#{name}\"]")&.[]('href')
  end

  # content of the first <meta> element whose property or name matches.
  def opengraph_tag(name)
    document.at_xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]")&.[]('content')
  end

  # content of the first <meta> element with the given name.
  def meta_tag(name)
    document.at_xpath("//meta[@name=\"#{name}\"]")&.[]('content')
  end

  # Wrapper around the first JSON-LD script tag, or nil when there is none.
  # Memoized with defined? so that a nil result is not recomputed.
  def structured_data
    return @structured_data if defined?(@structured_data)

    json_ld = document.at_xpath('//script[@type="application/ld+json"]')&.content
    @structured_data = json_ld.present? ? StructuredData.new(json_ld) : nil
  end

  def document
    @document ||= Nokogiri::HTML(@html, nil, encoding)
  end

  # Detected encoding name, used only when the detector reports a
  # confidence above 60; nil otherwise. Memoized with defined? so that a
  # nil result does not trigger another detection.
  def encoding
    return @encoding if defined?(@encoding)

    guess     = detector.detect(@html, @html_charset)
    @encoding = guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
  end

  # Encoding detector with strip_tags enabled.
  def detector
    @detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector|
      detector.strip_tags = true
    end
  end
end

app/lib/request.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def valid_url?(url)
9494
end
9595

9696
def http_client
97-
HTTP.use(:auto_inflate).timeout(TIMEOUT.dup).follow(max_hops: 2)
97+
HTTP.use(:auto_inflate).timeout(TIMEOUT.dup).follow(max_hops: 3)
9898
end
9999
end
100100

app/services/fetch_link_card_service.rb

Lines changed: 29 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@ class FetchLinkCardService < BaseService
1313
}iox
1414

1515
def call(status)
16-
@status = status
17-
@url = parse_urls
16+
@status = status
17+
@original_url = parse_urls
1818

19-
return if @url.nil? || @status.preview_cards.any?
19+
return if @original_url.nil? || @status.preview_cards.any?
2020

21-
@url = @url.to_s
21+
@url = @original_url.to_s
2222

2323
RedisLock.acquire(lock_options) do |lock|
2424
if lock.acquired?
@@ -31,7 +31,7 @@ def call(status)
3131

3232
attach_card if @card&.persisted?
3333
rescue HTTP::Error, OpenSSL::SSL::SSLError, Addressable::URI::InvalidURIError, Mastodon::HostValidationError, Mastodon::LengthValidationError => e
34-
Rails.logger.debug "Error fetching link #{@url}: #{e}"
34+
Rails.logger.debug "Error fetching link #{@original_url}: #{e}"
3535
nil
3636
end
3737

@@ -47,6 +47,12 @@ def html
4747
return @html if defined?(@html)
4848

4949
Request.new(:get, @url).add_headers('Accept' => 'text/html', 'User-Agent' => Mastodon::Version.user_agent + ' Bot').perform do |res|
50+
# We follow redirects, and ideally we want to save the preview card for
51+
# the destination URL and not any link shortener in-between, so here
52+
# we set the URL to the one of the last response in the redirect chain
53+
@url = res.request.uri.to_s.to_s
54+
@card = PreviewCard.find_or_initialize_by(url: @url) if @card.url != @url
55+
5056
if res.code == 200 && res.mime_type == 'text/html'
5157
@html_charset = res.charset
5258
@html = res.body_with_limit
@@ -63,12 +69,15 @@ def attach_card
6369
end
6470

6571
def parse_urls
66-
if @status.local?
67-
urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[1]).normalize }
68-
else
69-
html = Nokogiri::HTML(@status.text)
70-
links = html.css('a')
71-
urls = links.filter_map { |a| Addressable::URI.parse(a['href']) unless skip_link?(a) }.filter_map(&:normalize)
72+
urls = begin
73+
if @status.local?
74+
@status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[1]).normalize }
75+
else
76+
document = Nokogiri::HTML(@status.text)
77+
links = document.css('a')
78+
79+
links.filter_map { |a| Addressable::URI.parse(a['href']) unless skip_link?(a) }.filter_map(&:normalize)
80+
end
7281
end
7382

7483
urls.reject { |uri| bad_url?(uri) }.first
@@ -79,18 +88,16 @@ def bad_url?(uri)
7988
uri.host.blank? || TagManager.instance.local_url?(uri.to_s) || !%w(http https).include?(uri.scheme)
8089
end
8190

82-
# rubocop:disable Naming/MethodParameterName
83-
def mention_link?(a)
91+
def mention_link?(anchor)
8492
@status.mentions.any? do |mention|
85-
a['href'] == ActivityPub::TagManager.instance.url_for(mention.account)
93+
anchor['href'] == ActivityPub::TagManager.instance.url_for(mention.account)
8694
end
8795
end
8896

89-
def skip_link?(a)
97+
def skip_link?(anchor)
9098
# Avoid links for hashtags and mentions (microformats)
91-
a['rel']&.include?('tag') || a['class']&.match?(/u-url|h-card/) || mention_link?(a)
99+
anchor['rel']&.include?('tag') || anchor['class']&.match?(/u-url|h-card/) || mention_link?(anchor)
92100
end
93-
# rubocop:enable Naming/MethodParameterName
94101

95102
def attempt_oembed
96103
service = FetchOEmbedService.new
@@ -139,42 +146,14 @@ def attempt_oembed
139146
def attempt_opengraph
140147
return if html.nil?
141148

142-
detector = CharlockHolmes::EncodingDetector.new
143-
detector.strip_tags = true
144-
145-
guess = detector.detect(@html, @html_charset)
146-
encoding = guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
147-
page = Nokogiri::HTML(@html, nil, encoding)
148-
player_url = meta_property(page, 'twitter:player')
149-
150-
if player_url && !bad_url?(Addressable::URI.parse(player_url))
151-
@card.type = :video
152-
@card.width = meta_property(page, 'twitter:player:width') || 0
153-
@card.height = meta_property(page, 'twitter:player:height') || 0
154-
@card.html = content_tag(:iframe, nil, src: player_url,
155-
width: @card.width,
156-
height: @card.height,
157-
allowtransparency: 'true',
158-
scrolling: 'no',
159-
frameborder: '0')
160-
else
161-
@card.type = :link
162-
end
163-
164-
@card.title = meta_property(page, 'og:title').presence || page.at_xpath('//title')&.content || ''
165-
@card.description = meta_property(page, 'og:description').presence || meta_property(page, 'description') || ''
166-
@card.image_remote_url = (Addressable::URI.parse(@url) + meta_property(page, 'og:image')).to_s if meta_property(page, 'og:image')
167-
168-
return if @card.title.blank? && @card.html.blank?
169-
170-
@card.save_with_optional_image!
171-
end
149+
link_details_extractor = LinkDetailsExtractor.new(@url, @html, @html_charset)
172150

173-
def meta_property(page, property)
174-
page.at_xpath("//meta[contains(concat(' ', normalize-space(@property), ' '), ' #{property} ')]")&.attribute('content')&.value || page.at_xpath("//meta[@name=\"#{property}\"]")&.attribute('content')&.value
151+
@card = PreviewCard.find_or_initialize_by(url: link_details_extractor.canonical_url) if link_details_extractor.canonical_url != @card.url
152+
@card.assign_attributes(link_details_extractor.to_preview_card_attributes)
153+
@card.save_with_optional_image! unless @card.title.blank? && @card.html.blank?
175154
end
176155

177156
def lock_options
178-
{ redis: Redis.current, key: "fetch:#{@url}", autorelease: 15.minutes.seconds }
157+
{ redis: Redis.current, key: "fetch:#{@original_url}", autorelease: 15.minutes.seconds }
179158
end
180159
end
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
require 'rails_helper'
2+
3+
# Specs for LinkDetailsExtractor#canonical_url: a canonical URL declared by
# the page is only honoured when it is on the same host as the fetched URL.
RSpec.describe LinkDetailsExtractor do
  let(:original_url) { '' }
  let(:html) { '' }
  let(:html_charset) { nil }

  subject { described_class.new(original_url, html, html_charset) }

  describe '#canonical_url' do
    let(:original_url) { 'https://foo.com/article?bar=baz123' }

    context 'when canonical URL points to another host' do
      let(:html) { '<!doctype html><link rel="canonical" href="https://bar.com/different-article" />' }

      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end

    context 'when canonical URL points to the same host' do
      let(:html) { '<!doctype html><link rel="canonical" href="https://foo.com/article" />' }

      # Description corrected: this example asserts the canonical URL is
      # used, not ignored.
      it 'uses the canonical URL' do
        expect(subject.canonical_url).to eq 'https://foo.com/article'
      end
    end
  end
end

spec/services/fetch_link_card_service_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
require 'rails_helper'
22

33
RSpec.describe FetchLinkCardService, type: :service do
4-
subject { FetchLinkCardService.new }
4+
subject { described_class.new }
55

66
before do
77
stub_request(:get, 'http://example.xn--fiqs8s/').to_return(request_fixture('idn.txt'))

0 commit comments

Comments
 (0)