commit: 3816943e6b5e86b22c35f3c068521f7a9007deec
parent: b39d512ade9f556ae29d60239102faf67ff6a89f
Author: ふぁぼ原 <ko_kurihara@yahoo.co.jp>
Date: Fri, 15 Sep 2017 01:03:20 +0900
Enable recognizing most kinds of characters in URL paths (#4941)
Diffstat:
5 files changed, 96 insertions(+), 5 deletions(-)
diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb
@@ -131,7 +131,7 @@ class Formatter
end
def link_html(url)
- url = Addressable::URI.parse(url).display_uri.to_s
+ url = Addressable::URI.parse(url).to_s
prefix = url.match(/\Ahttps?:\/\/(www\.)?/).to_s
text = url[prefix.length, 30]
suffix = url[prefix.length + 30..-1]
diff --git a/app/services/fetch_link_card_service.rb b/app/services/fetch_link_card_service.rb
@@ -1,9 +1,15 @@
# frozen_string_literal: true
class FetchLinkCardService < BaseService
- include ActionView::Helpers::TagHelper
-
- URL_PATTERN = %r{https?://\S+}
+ URL_PATTERN = %r{
+ ( # $1 URL
+ (https?:\/\/)? # $2 Protocol (optional)
+ (#{Twitter::Regex[:valid_domain]}) # $3 Domain(s)
+ (?::(#{Twitter::Regex[:valid_port_number]}))? # $4 Port number (optional)
+ (/#{Twitter::Regex[:valid_url_path]}*)? # $5 URL Path and anchor
+ (\?#{Twitter::Regex[:valid_url_query_chars]}*#{Twitter::Regex[:valid_url_query_ending_chars]})? # $6 Query String
+ )
+ }iox
def call(status)
@status = status
@@ -42,7 +48,7 @@ class FetchLinkCardService < BaseService
def parse_urls
if @status.local?
- urls = @status.text.match(URL_PATTERN).to_a.map { |uri| Addressable::URI.parse(uri).normalize }
+ urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[0]).normalize }
else
html = Nokogiri::HTML(@status.text)
links = html.css('a')
diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb
@@ -0,0 +1,42 @@
+module Twitter
+  class Regex
+    # Redefines URL-related REGEXEN entries of the reopened Twitter::Regex: path chars become anything but whitespace, parentheses and "?".
+    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
+    REGEXEN[:valid_url_balanced_parens] = /
+      \(
+        (?:
+          #{REGEXEN[:valid_general_url_path_chars]}+
+          |
+          # allow one nested level of balanced parentheses
+          (?:
+            #{REGEXEN[:valid_general_url_path_chars]}*
+            \(
+              #{REGEXEN[:valid_general_url_path_chars]}+
+            \)
+            #{REGEXEN[:valid_general_url_path_chars]}*
+          )
+        )
+      \)
+    /iox
+    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]\p{Pd}_~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou # assigned after :valid_url_balanced_parens so the redefinition above, not the earlier value, is interpolated
+    REGEXEN[:valid_url_path] = /(?:
+      (?:
+        #{REGEXEN[:valid_general_url_path_chars]}*
+        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+        #{REGEXEN[:valid_url_path_ending_chars]}
+      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+    )/iox
+    REGEXEN[:valid_url] = %r{
+      (                                                                                     #   $1 total match
+        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
+        (                                                                                   #   $3 URL
+          (https?:\/\/)?                                                                    #   $4 Protocol (optional)
+          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
+          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
+          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
+          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
+        )
+      )
+    }iox
+  end
+end
diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb
@@ -89,6 +89,38 @@ RSpec.describe Formatter do
end
end
+ context 'matches a URL with Japanese path string' do
+ let(:text) { 'https://ja.wikipedia.org/wiki/日本' }
+
+ it 'has valid URL' do
+ is_expected.to include 'href="https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC"'
+ end
+ end
+
+ context 'matches a URL with Korean path string' do
+ let(:text) { 'https://ko.wikipedia.org/wiki/대한민국' }
+
+ it 'has valid URL' do
+ is_expected.to include 'href="https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD"'
+ end
+ end
+
+ context 'matches a URL with Simplified Chinese path string' do
+ let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }
+
+ it 'has valid URL' do
+ is_expected.to include 'href="https://baike.baidu.com/item/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD"'
+ end
+ end
+
+ context 'matches a URL with Traditional Chinese path string' do
+ let(:text) { 'https://zh.wikipedia.org/wiki/臺灣' }
+
+ it 'has valid URL' do
+ is_expected.to include 'href="https://zh.wikipedia.org/wiki/%E8%87%BA%E7%81%A3"'
+ end
+ end
+
context 'contains HTML (script tag)' do
let(:text) { '<script>alert("Hello")</script>' }
diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb
@@ -12,6 +12,8 @@ RSpec.describe FetchLinkCardService do
stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
+ stub_request(:head, 'http://example.com/日本語').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
+ stub_request(:get, 'http://example.com/日本語').to_return(request_fixture('sjis.txt'))
stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404)
subject.call(status)
@@ -52,6 +54,15 @@ RSpec.describe FetchLinkCardService do
expect(status.preview_cards.first.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.")
end
end
+
+ context do
+ let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') }
+
+ it 'works with Japanese path string' do
+ expect(a_request(:get, 'http://example.com/日本語')).to have_been_made.at_least_once
+ expect(status.preview_cards.first.title).to eq("SJISのページ")
+ end
+ end
end
context 'in a remote status' do