logo

mastofe

My custom branch(es) on git.pleroma.social/pleroma/mastofe
commit: d010e270e613f6299397601289158bd2acedbe8e
parent: d1e08bd38c029f0b47dfd2f3ba61ca5bb3e414b8
Author: Matt Jankowski <mjankowski@thoughtbot.com>
Date:   Thu,  1 Jun 2017 09:29:14 -0400

Remove usernames and hashtags from language detection (#3503)

* Add failing specs for hashtag and username extraction in language detector

* Remove usernames and hashtags from text before language detection

* Handle multiple instances of special case, and reduce whitespace

Diffstat:

M app/lib/language_detector.rb       | 11 +++++++++--
M spec/lib/language_detector_spec.rb | 38 ++++++++++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
@@ -13,6 +13,10 @@ class LanguageDetector
     detected_language_code || default_locale.to_sym
   end
 
+  def prepared_text
+    simplified_text.strip
+  end
+
   private
 
   def detected_language_code
@@ -20,18 +24,21 @@ class LanguageDetector
   end
 
   def result
-    @result ||= @identifier.find_language(text_without_urls)
+    @result ||= @identifier.find_language(prepared_text)
   end
 
   def detected_language_reliable?
     result.reliable?
   end
 
-  def text_without_urls
+  def simplified_text
     text.dup.tap do |new_text|
       URI.extract(new_text).each do |url|
         new_text.gsub!(url, '')
       end
+      new_text.gsub!(Account::MENTION_RE, '')
+      new_text.gsub!(Tag::HASHTAG_RE, '')
+      new_text.gsub!(/\s+/, ' ')
     end
   end
 
diff --git a/spec/lib/language_detector_spec.rb b/spec/lib/language_detector_spec.rb
@@ -1,7 +1,45 @@
 # frozen_string_literal: true
+
 require 'rails_helper'
 
 describe LanguageDetector do
+  describe 'prepared_text' do
+    it 'returns unmodified string without special cases' do
+      string = 'just a regular string'
+      result = described_class.new(string).prepared_text
+
+      expect(result).to eq string
+    end
+
+    it 'collapses spacing in strings' do
+      string = 'The formatting in this is very odd'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'The formatting in this is very odd'
+    end
+
+    it 'strips usernames from strings before detection' do
+      string = '@username Yeah, very surreal...! also @friend'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Yeah, very surreal...! also'
+    end
+
+    it 'strips URLs from strings before detection' do
+      string = 'Our website is https://example.com and also http://localhost.dev'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Our website is and also'
+    end
+
+    it 'strips #hashtags from strings before detection' do
+      string = 'Hey look at all the #animals and #fish'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Hey look at all the and'
+    end
+  end
+
   describe 'to_iso_s' do
     it 'detects english language for basic strings' do
       strings = [