commit: ad207456d64c76d21c17b26a954b459fe2dc0f54
parent: 9e3d24a150acf118789461797735fc0e4a8a30ea
Author: Renato "Lond" Cerqueira <renato@lond.com.br>
Date: Thu, 16 Nov 2017 10:51:38 -0200
Improve language filter (#5724)
* Scrub text of html before detecting language.
* Detect language on statuses coming from activitypub.
* Fix rubocop comments.
* Remove custom emoji from text before language detection
Diffstat:
2 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb
@@ -173,7 +173,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
end
def language_from_content
- return nil unless language_map?
+ return LanguageDetector.instance.detect(text_from_content, @account) unless language_map?
@object['contentMap'].keys.first
end
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
@@ -38,12 +38,31 @@ class LanguageDetector
end
def simplify_text(text)
- text.dup.tap do |new_text|
- new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
- new_text.gsub!(Account::MENTION_RE, '')
- new_text.gsub!(Tag::HASHTAG_RE, '')
- new_text.gsub!(/\s+/, ' ')
- end
+ new_text = remove_html(text)
+ new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
+ new_text.gsub!(Account::MENTION_RE, '')
+ new_text.gsub!(Tag::HASHTAG_RE, '')
+ new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '')
+ new_text.gsub!(/\s+/, ' ')
+ new_text
+ end
+
+ def new_scrubber
+ scrubber = Rails::Html::PermitScrubber.new
+ scrubber.tags = %w(br p)
+ scrubber
+ end
+
+ def scrubber
+ @scrubber ||= new_scrubber
+ end
+
+ def remove_html(text)
+ text = Loofah.fragment(text).scrub!(scrubber).to_s
+ text.gsub!('<br>', "\n")
+ text.gsub!('</p><p>', "\n\n")
+ text.gsub!(/(^<p>|<\/p>$)/, '')
+ text
end
def default_locale(account)