commit: 007ab330e6ffb1e07995d4e306473d457043e2eb
parent: 794781d1219112482e4abbc0a98683a17d170e2b
Author: nullkal <nullkal@users.noreply.github.com>
Date: Sun, 9 Jul 2017 05:44:31 +0900
Use charlock_holmes instead of nkf at FetchLinkCardService (#4080)
* Specs for language detection
* Use CharlockHolmes instead of NKF
* Correct mistakes
* Correct style
* Set hint_enc instead of falling back and strip_tags
* Improve specs
* Add dependencies
Diffstat:
11 files changed, 78 insertions(+), 4 deletions(-)
diff --git a/.travis.yml b/.travis.yml
@@ -32,6 +32,7 @@ addons:
- g++-6
- libprotobuf-dev
- protobuf-compiler
+ - libicu-dev
rvm:
- 2.3.4
diff --git a/Aptfile b/Aptfile
@@ -3,3 +3,4 @@ libprotobuf-dev
ffmpeg
libxdamage1
libxfixes3
+libicu-dev
diff --git a/Dockerfile b/Dockerfile
@@ -25,6 +25,7 @@ RUN echo "@edge https://nl.alpinelinux.org/alpine/edge/main" >> /etc/apk/reposit
ffmpeg \
file \
git \
+ icu-dev \
imagemagick@edge \
libpq \
libxml2 \
diff --git a/Gemfile b/Gemfile
@@ -22,6 +22,7 @@ gem 'active_model_serializers', '~> 0.10'
gem 'addressable', '~> 2.5'
gem 'bootsnap'
gem 'browser'
+gem 'charlock_holmes', '~> 0.7.3'
gem 'cld3', '~> 3.1'
gem 'devise', '~> 4.2'
gem 'devise-two-factor', '~> 3.0'
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -106,6 +106,7 @@ GEM
rack (>= 1.0.0)
rack-test (>= 0.5.4)
xpath (~> 2.0)
+ charlock_holmes (0.7.3)
case_transform (0.2)
activesupport
chunky_png (1.3.8)
@@ -501,6 +502,7 @@ DEPENDENCIES
capistrano-rbenv (~> 2.1)
capistrano-yarn (~> 2.0)
capybara (~> 2.14)
+ charlock_holmes (~> 0.7.3)
cld3 (~> 3.1)
climate_control (~> 0.2)
devise (~> 4.2)
diff --git a/Vagrantfile b/Vagrantfile
@@ -37,6 +37,7 @@ sudo apt-get install \
yarn \
libprotobuf-dev \
libreadline-dev \
+ libicu-dev \
-y
# Install rvm
diff --git a/app/services/fetch_link_card_service.rb b/app/services/fetch_link_card_service.rb
@@ -1,5 +1,4 @@
# frozen_string_literal: true
-require 'nkf'
class FetchLinkCardService < BaseService
include HttpHelper
@@ -86,7 +85,12 @@ class FetchLinkCardService < BaseService
return if response.code != 200 || response.mime_type != 'text/html'
html = response.to_s
- page = Nokogiri::HTML(html, nil, NKF.guess(html).to_s)
+
+ detector = CharlockHolmes::EncodingDetector.new
+ detector.strip_tags = true
+
+ guess = detector.detect(html, response.charset)
+ page = Nokogiri::HTML(html, nil, guess&.fetch(:encoding))
card.type = :link
card.title = meta_property(page, 'og:title') || page.at_xpath('//title')&.content
diff --git a/spec/fixtures/requests/koi8-r.txt b/spec/fixtures/requests/koi8-r.txt
@@ -0,0 +1,20 @@
+HTTP/1.1 200 OK
+Server: nginx/1.11.10
+Date: Tue, 04 Jul 2017 16:43:39 GMT
+Content-Type: text/html
+Content-Length: 273
+Connection: keep-alive
+Last-Modified: Tue, 04 Jul 2017 16:41:34 GMT
+Accept-Ranges: bytes
+
+<HTML>
+<HEAD>
+ <META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
+ <META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=koi8-r">
+ <TITLE>íÏÓËÏ×Ñ ÎÁÞÉÎÁÅÔß ÔÏÌØËÏ ×ß XVI ÓÔ. ÐÒÉ×ÌÅËÁÔØ ×ÎÉÍÁÎÅ ÉÎÏÓÔÒÁÎÃÅ×ß.</TITLE>
+</HEAD>
+<BODY>
+<P><CENTER><B><FONT SIZE="+2">íÏÓËÏ×Ñ ÎÁÞÉÎÁÅÔß ÔÏÌØËÏ ×ß XVI ÓÔ. ÐÒÉ×ÌÅËÁÔØ ×ÎÉÍÁÎÅ ÉÎÏÓÔÒÁÎÃÅ×ß.</FONT></B><BR>
+<HR><BR>
+</BODY>
+</HTML>
diff --git a/spec/fixtures/requests/sjis.txt b/spec/fixtures/requests/sjis.txt
@@ -11,10 +11,10 @@ Accept-Ranges: bytes
<HEAD>
<META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
<META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=x-sjis">
- <TITLE>JSIS‚̃y[ƒW</TITLE>
+ <TITLE>SJIS‚̃y[ƒW</TITLE>
</HEAD>
<BODY>
-<P><CENTER><B><FONT SIZE="+2">SJIS‚̃y[ƒW</FONT></B><BR>
+<P><CENTER><B><FONT SIZE="+2">Ž„‚à“¯”N‚Ü‚µ‚Ä‚¢‚í‚ä‚é‹L”Ol‚Á‚Ä‚à‚Ì‚ÌŽž‚Å‚µ‚ ‚è‚Å‚·B‚à‚µŽžŠÔ‚ɈӖ¡ŽÒ‚ͳ‚µ‚‚Ç‚ñ‚È”‰ï‚Ü‚¹‚¾‚Ü‚Å‚ª\‚µã‚°‚ª‚¢‚ç‚Á‚µ‚á‚邽‚É‚ÍŽQl‹A‚邽‚¢‚¾‚©‚çA‚µ‚É‚à‚â‚Á‚ ‚Á‚Ü‚µ‚È‚½B‹à‚©‚ç‚¢‚¤‚È‚¢‚Ì‚Í‚Ç‚¤‚à‹ãŒŽ‚ð‚Å‚«‚邾‚¯‚½‚½‚‚½B‚¯‚Á‚µ‚ĉª“c‚³‚ñ‚É”½RK‚µ’¥‚ɉ]‚¨‚Å‚µ‚å‹à—Í‚±‚¤‚µ‚½Œ —Í‚ ‚È‚½‚©Žw}‚ª‚Æ‚¢‚¤‚¨o“ü‚è‚È‚‚¾‚ë‚È‚ ‚è‚ÄA‚»‚ÌÌ‚ÍŽ„‚©‹à—͉A‚ð“{‚ç‚©‚çA‹vŒ´‚³‚ñ‚Ì‚à‚Ì‚ð‚ª‚½‚Ì‚¢‚‚ª‚µ‚©‚é‚É‚²Šó–]‚ÆŒü‚¢‚΂»‚êman‚É‚²–µ‚‚ÖŽQ‚è‚悤‚É“¯Žž‚É‚²‰‰à‚ª‚µ‚Å‚È‚ç‚Ì‚ÅA‘½•ª‚à‚µ•\— ‚É•Ï‚Á‚½‚Ä‚‚ê‚Å‚·Ž–‚Ål‚¦‚½‚½B‚µ‚©‚à—Ⴆ‚΂²‚ª‚½‚ª‚Æ‚Ç‚Ü‚ç‚à‚Ì‚àŽÀÛ‚Þ‚â‚Ý‚Æ‚ ‚è‚Å‚·‚ÄA‚±‚ÌŽ©•ª‚Å‚Í\‚µ‚ñ‚Ä‚Æ‚µ‚Ä¢ŠÔ‚É•À‚ׂ̂És‚©‚È‚©‚Á‚ÈB</FONT></B><BR>
<HR><BR>
</BODY>
</HTML>
diff --git a/spec/fixtures/requests/sjis_with_wrong_charset.txt b/spec/fixtures/requests/sjis_with_wrong_charset.txt
@@ -0,0 +1,20 @@
+HTTP/1.1 200 OK
+Server: nginx/1.11.10
+Date: Tue, 04 Jul 2017 16:43:39 GMT
+Content-Type: text/html; charset=utf-8
+Content-Length: 273
+Connection: keep-alive
+Last-Modified: Tue, 04 Jul 2017 16:41:34 GMT
+Accept-Ranges: bytes
+
+<HTML>
+<HEAD>
+ <META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
+ <META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=x-sjis">
+ <TITLE>SJIS‚̃y[ƒW</TITLE>
+</HEAD>
+<BODY>
+<P><CENTER><B><FONT SIZE="+2">Ž„‚à“¯”N‚Ü‚µ‚Ä‚¢‚í‚ä‚é‹L”Ol‚Á‚Ä‚à‚Ì‚ÌŽž‚Å‚µ‚ ‚è‚Å‚·B‚à‚µŽžŠÔ‚ɈӖ¡ŽÒ‚ͳ‚µ‚‚Ç‚ñ‚È”‰ï‚Ü‚¹‚¾‚Ü‚Å‚ª\‚µã‚°‚ª‚¢‚ç‚Á‚µ‚á‚邽‚É‚ÍŽQl‹A‚邽‚¢‚¾‚©‚çA‚µ‚É‚à‚â‚Á‚ ‚Á‚Ü‚µ‚È‚½B‹à‚©‚ç‚¢‚¤‚È‚¢‚Ì‚Í‚Ç‚¤‚à‹ãŒŽ‚ð‚Å‚«‚邾‚¯‚½‚½‚‚½B‚¯‚Á‚µ‚ĉª“c‚³‚ñ‚É”½RK‚µ’¥‚ɉ]‚¨‚Å‚µ‚å‹à—Í‚±‚¤‚µ‚½Œ —Í‚ ‚È‚½‚©Žw}‚ª‚Æ‚¢‚¤‚¨o“ü‚è‚È‚‚¾‚ë‚È‚ ‚è‚ÄA‚»‚ÌÌ‚ÍŽ„‚©‹à—͉A‚ð“{‚ç‚©‚çA‹vŒ´‚³‚ñ‚Ì‚à‚Ì‚ð‚ª‚½‚Ì‚¢‚‚ª‚µ‚©‚é‚É‚²Šó–]‚ÆŒü‚¢‚΂»‚êman‚É‚²–µ‚‚ÖŽQ‚è‚悤‚É“¯Žž‚É‚²‰‰à‚ª‚µ‚Å‚È‚ç‚Ì‚ÅA‘½•ª‚à‚µ•\— ‚É•Ï‚Á‚½‚Ä‚‚ê‚Å‚·Ž–‚Ål‚¦‚½‚½B‚µ‚©‚à—Ⴆ‚΂²‚ª‚½‚ª‚Æ‚Ç‚Ü‚ç‚à‚Ì‚àŽÀÛ‚Þ‚â‚Ý‚Æ‚ ‚è‚Å‚·‚ÄA‚±‚ÌŽ©•ª‚Å‚Í\‚µ‚ñ‚Ä‚Æ‚µ‚Ä¢ŠÔ‚É•À‚ׂ̂És‚©‚È‚©‚Á‚ÈB</FONT></B><BR>
+<HR><BR>
+</BODY>
+</HTML>
diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb
@@ -8,6 +8,10 @@ RSpec.describe FetchLinkCardService do
stub_request(:get, 'http://example.xn--fiqs8s/').to_return(request_fixture('idn.txt'))
stub_request(:head, 'http://example.com/sjis').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
stub_request(:get, 'http://example.com/sjis').to_return(request_fixture('sjis.txt'))
+ stub_request(:head, 'http://example.com/sjis_with_wrong_charset').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
+ stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
+ stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
+ stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404)
subject.call(status)
@@ -27,6 +31,25 @@ RSpec.describe FetchLinkCardService do
it 'works with SJIS' do
expect(a_request(:get, 'http://example.com/sjis')).to have_been_made.at_least_once
+ expect(status.preview_card.title).to eq("SJISのページ")
+ end
+ end
+
+ context do
+ let(:status) { Fabricate(:status, text: 'Check out http://example.com/sjis_with_wrong_charset') }
+
+ it 'works with SJIS even with wrong charset header' do
+ expect(a_request(:get, 'http://example.com/sjis_with_wrong_charset')).to have_been_made.at_least_once
+ expect(status.preview_card.title).to eq("SJISのページ")
+ end
+ end
+
+ context do
+ let(:status) { Fabricate(:status, text: 'Check out http://example.com/koi8-r') }
+
+ it 'works with koi8-r' do
+ expect(a_request(:get, 'http://example.com/koi8-r')).to have_been_made.at_least_once
+ expect(status.preview_card.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.")
end
end
end