
searx

My custom branch(es) on searx, a meta-search engine
git clone https://hacktivis.me/git/searx.git

wikipedia.py (3752B)


  1. """
  2. Wikipedia (Web)
  3. @website https://{language}.wikipedia.org
  4. @provide-api yes
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, infobox
  9. """
from json import loads
from lxml.html import fromstring
from searx.url_utils import quote, urlencode
from searx.utils import match_language

# search-url
base_url = u'https://{language}.wikipedia.org/'
search_url = base_url + u'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'

# set language in base_url
def url_lang(lang):
    return match_language(lang, supported_languages).split('-')[0]

# do search-request
def request(query, params):
    # an all-lowercase query is also queried as its title-cased variant,
    # since Wikipedia article titles start with a capital letter
    if query.islower():
        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')

    params['url'] = search_url.format(query=urlencode({'titles': query}),
                                      language=url_lang(params['language']))

    return params

# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    first_paragraph = None

    failed_attempts = 0
    for paragraph in content.split('\n'):

        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)

        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break

        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return first_paragraph

# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.text)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break

    # a negative id means the requested article was not found
    if int(article_id) < 0:
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    extract = page.get('extract')

    summary = extract_first_paragraph(extract, title, image)

    # link to wikipedia article
    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results

# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return supported_languages
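
A minimal sketch of exercising the engine's request() helper outside a full searx instance, much as searx's engine unit tests do. It assumes a searx checkout is importable and stubs supported_languages, which searx normally injects when it loads the engine:

    from collections import defaultdict
    from searx.engines import wikipedia

    # stub the language table that searx fills in at engine load time
    wikipedia.supported_languages = ['en', 'de', 'fr']

    params = defaultdict(dict)
    params['language'] = 'en-US'
    params = wikipedia.request(u'garden gnome'.encode('utf-8'), params)
    print(params['url'])
    # -> https://en.wikipedia.org/w/api.php?action=query&format=json
    #    &titles=garden+gnome%7CGarden+Gnome&prop=extracts|pageimages&...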