commit: a11948c71bfe7b2aac6e50e7634874d5073c7d84
parent a11230819ff76312b4528b2bfb6e03d0560075b1
Author: marc <a01200356@itesm.mx>
Date: Sat, 29 Oct 2016 21:04:01 -0500
Add language support for more engines.
Diffstat:
15 files changed, 66 insertions(+), 39 deletions(-)
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
@@ -20,6 +20,24 @@ from datetime import datetime
categories = ['videos']
paging = True
language_support = True
+supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
+ "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
+ "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
+ "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
+ "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
+ "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
+ "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
+ "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
+ "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
+ "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
+ "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
+ "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
+ "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
+ "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
+ "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
+ "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
+ "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
+ "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
# search-url
# see http://www.dailymotion.com/doc/api/obj-video.html
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
@@ -16,7 +16,6 @@
from urllib import urlencode
from lxml.html import fromstring
from searx.engines.xpath import extract_text
-from searx.languages import language_codes
# engine dependent config
categories = ['general']
@@ -76,26 +75,7 @@ def request(query, params):
else:
# tries to get a country code from language
locale = locale[0].lower()
- lang_codes = [x[0] for x in language_codes]
- for lc in lang_codes:
- lc = lc.split('-')
- if locale == lc[0] and len(lc) == 2:
- locale = lc[1].lower() + '-' + lc[0].lower()
- break
-
- if locale:
- params['url'] = url.format(
- query=urlencode({'q': query, 'kl': locale}), offset=offset)
- else:
- locale = params['language'].split('-')
- if len(locale) == 2:
- # country code goes first
- locale = locale[1].lower() + '-' + locale[0].lower()
- else:
- # tries to get a country code from language
- locale = locale[0].lower()
- lang_codes = [x[0] for x in language_codes]
- for lc in lang_codes:
+ for lc in supported_languages:
lc = lc.split('-')
if locale == lc[0]:
locale = lc[1].lower() + '-' + lc[0].lower()
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
@@ -44,7 +44,7 @@ supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko
"nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
"th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
"hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
- "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
+ "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
# do search-request
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
@@ -20,6 +20,11 @@ from searx.utils import html_to_text
categories = None
paging = True
language_support = True
+supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
+ "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
+ "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
+ "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
+ "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
category_to_keyword = {'general': 'web',
'images': 'images',
@@ -46,7 +51,15 @@ def request(query, params):
# add language tag if specified
if params['language'] != 'all':
- params['url'] += '&locale=' + params['language'].lower()
+ locale = params['language'].split('-')
+ if len(locale) == 2 and params['language'] in supported_languages:
+ params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
+ else:
+ # try to get a country code for language
+ for lang in supported_languages:
+ if locale[0] == lang.split('-')[0]:
+ params['url'] += '&locale=' + lang.replace('-', '_').lower()
+ break
return params
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
@@ -24,6 +24,11 @@ categories = ['general']
# paging = False
language_support = True
+supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
+ "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
+ "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
+ "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
+ "sv", "tl", "th", "tr", "uk", "vi"]
# search-url
base_url = 'https://startpage.com/'
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
@@ -18,6 +18,12 @@ import re
categories = ['general', 'images']
paging = True
language_support = True
+supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
+ "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
+ "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
+ "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
+ "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
+ "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
# search-url
base_url = 'https://swisscows.ch/'
@@ -35,6 +41,8 @@ def request(query, params):
if params['language'] == 'all':
ui_language = 'browser'
region = 'browser'
+ elif params['language'].split('-')[0] == 'no':
+ region, ui_language = 'nb-NO', 'no'
else:
region = params['language']
ui_language = params['language'].split('-')[0]
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
@@ -22,7 +22,9 @@ language_support = True # TODO
default_tld = 'com'
language_map = {'ru': 'ru',
- 'ua': 'uk',
+ 'uk': 'ua',
+ 'be': 'by',
+ 'kk': 'kz',
'tr': 'com.tr'}
# search-url
diff --git a/searx/languages.py b/searx/languages.py
@@ -100,7 +100,7 @@ language_codes = (
(u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
(u"he-IL", u"עברית", u"", u"Hebrew"),
(u"se", u"Sámegiella", u"", u"Northern Sami"),
- (u"sd", u"سنڌي، سندھی ، सिन्ध", u"", u"Sindhi"),
+ (u"sd", u"سنڌي ،सिन्ध", u"", u"Sindhi"),
(u"fr-CH", u"Français", u"", u"French"),
(u"zea", u"Zeêuws", u"", u"Zeelandic"),
(u"it-CH", u"Italiano", u"", u"Italian"),
@@ -191,6 +191,7 @@ language_codes = (
(u"jam", u"Jamaican Creole English", u"", u"Patois"),
(u"udm", u"Удмурт кыл", u"", u"Udmurt"),
(u"ksh", u"Ripoarisch", u"", u"Ripuarian"),
+ (u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
(u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
(u"de", u"Deutsch", u"", u"German"),
(u"da", u"Dansk", u"", u"Danish"),
@@ -284,6 +285,7 @@ language_codes = (
(u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"),
(u"ca-CT", u"Català", u"", u"Catalan"),
(u"en-MY", u"English", u"", u"English"),
+ (u"olo", u"Livvinkarjala", u"", u"Livvi-Karelian"),
(u"sv-SE", u"Svenska", u"", u"Swedish"),
(u"de-AT", u"Deutsch", u"", u"German"),
(u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
diff --git a/tests/unit/engines/test_duckduckgo.py b/tests/unit/engines/test_duckduckgo.py
@@ -11,7 +11,7 @@ class TestDuckduckgoEngine(SearxTestCase):
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
- dicto['language'] = 'de_CH'
+ dicto['language'] = 'de-CH'
dicto['time_range'] = ''
params = duckduckgo.request(query, dicto)
self.assertIn('url', params)
diff --git a/tests/unit/engines/test_duckduckgo_definitions.py b/tests/unit/engines/test_duckduckgo_definitions.py
@@ -21,10 +21,14 @@ class TestDDGDefinitionsEngine(SearxTestCase):
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
+ dicto['language'] = 'es'
params = duckduckgo_definitions.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])
self.assertIn('duckduckgo.com', params['url'])
+ self.assertIn('headers', params)
+ self.assertIn('Accept-Language', params['headers'])
+ self.assertIn('es', params['headers']['Accept-Language'])
def test_response(self):
self.assertRaises(AttributeError, duckduckgo_definitions.response, None)
diff --git a/tests/unit/engines/test_google.py b/tests/unit/engines/test_google.py
@@ -18,7 +18,7 @@ class TestGoogleEngine(SearxTestCase):
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
- dicto['language'] = 'fr_FR'
+ dicto['language'] = 'fr-FR'
dicto['time_range'] = ''
params = google.request(query, dicto)
self.assertIn('url', params)
diff --git a/tests/unit/engines/test_qwant.py b/tests/unit/engines/test_qwant.py
@@ -10,7 +10,7 @@ class TestQwantEngine(SearxTestCase):
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 0
- dicto['language'] = 'fr_FR'
+ dicto['language'] = 'fr-FR'
qwant.categories = ['']
params = qwant.request(query, dicto)
self.assertIn('url', params)
diff --git a/tests/unit/engines/test_swisscows.py b/tests/unit/engines/test_swisscows.py
@@ -10,7 +10,7 @@ class TestSwisscowsEngine(SearxTestCase):
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
- dicto['language'] = 'de_DE'
+ dicto['language'] = 'de-DE'
params = swisscows.request(query, dicto)
self.assertTrue('url' in params)
self.assertTrue(query in params['url'])
diff --git a/tests/unit/engines/test_wikipedia.py b/tests/unit/engines/test_wikipedia.py
@@ -10,7 +10,7 @@ class TestWikipediaEngine(SearxTestCase):
def test_request(self):
query = 'test_query'
dicto = defaultdict(dict)
- dicto['language'] = 'fr_FR'
+ dicto['language'] = 'fr-FR'
params = wikipedia.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])
diff --git a/utils/update_languages.py b/utils/update_languages.py
@@ -41,7 +41,6 @@ def valid_code(lang_code):
if len(lang_code) > 2 or len(lang_code[0]) > 3:
return False
if len(lang_code) == 2 and len(lang_code[1]) > 2:
- print lang_code
return False
return True
@@ -62,8 +61,8 @@ def get_wikipedia_languages():
english_name = td[1].xpath('./a')[0].text
articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
- # exclude languages with few articles and language variants
- if code not in languages and articles >= 100 and valid_code(code):
+ # exclude language variants and languages with few articles
+ if code not in languages and articles >= 1000 and valid_code(code):
languages[code] = (name, '', english_name)
@@ -90,7 +89,7 @@ def join_language_lists():
# try to get language name
language = languages.get(locale.split('-')[0], None)
if language == None:
- # print engine_name + ": " + locale
+ print engine_name + ": " + locale
continue
(name, country, english) = language
@@ -117,12 +116,8 @@ def write_languages_file():
new_file.close()
-def main():
+if __name__ == "__main__":
get_wikipedia_languages()
get_google_languages()
join_language_lists()
write_languages_file()
-
-
-if __name__ == "__main__":
- main()