logo

searx

My custom branch(es) of searx, a meta-search engine. git clone https://hacktivis.me/git/searx.git

duckduckgo_definitions.py (5666B)


  1. import json
  2. from lxml import html
  3. from re import compile
  4. from searx.engines.xpath import extract_text
  5. from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
  6. from searx.url_utils import urlencode
  7. from searx.utils import html_to_text, match_language
  8. url = 'https://api.duckduckgo.com/'\
  9. + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
  10. http_regex = compile(r'^http:')
  11. def result_to_text(url, text, htmlResult):
  12. # TODO : remove result ending with "Meaning" or "Category"
  13. dom = html.fromstring(htmlResult)
  14. a = dom.xpath('//a')
  15. if len(a) >= 1:
  16. return extract_text(a[0])
  17. else:
  18. return text
  19. def request(query, params):
  20. params['url'] = url.format(query=urlencode({'q': query}))
  21. language = match_language(params['language'], supported_languages, language_aliases)
  22. params['headers']['Accept-Language'] = language.split('-')[0]
  23. return params
  24. def response(resp):
  25. results = []
  26. search_res = json.loads(resp.text)
  27. content = ''
  28. heading = search_res.get('Heading', '')
  29. attributes = []
  30. urls = []
  31. infobox_id = None
  32. relatedTopics = []
  33. # add answer if there is one
  34. answer = search_res.get('Answer', '')
  35. if answer != '':
  36. results.append({'answer': html_to_text(answer)})
  37. # add infobox
  38. if 'Definition' in search_res:
  39. content = content + search_res.get('Definition', '')
  40. if 'Abstract' in search_res:
  41. content = content + search_res.get('Abstract', '')
  42. # image
  43. image = search_res.get('Image', '')
  44. image = None if image == '' else image
  45. # attributes
  46. if 'Infobox' in search_res:
  47. infobox = search_res.get('Infobox', None)
  48. if 'content' in infobox:
  49. for info in infobox.get('content'):
  50. attributes.append({'label': info.get('label'),
  51. 'value': info.get('value')})
  52. # urls
  53. for ddg_result in search_res.get('Results', []):
  54. if 'FirstURL' in ddg_result:
  55. firstURL = ddg_result.get('FirstURL', '')
  56. text = ddg_result.get('Text', '')
  57. urls.append({'title': text, 'url': firstURL})
  58. results.append({'title': heading, 'url': firstURL})
  59. # related topics
  60. for ddg_result in search_res.get('RelatedTopics', []):
  61. if 'FirstURL' in ddg_result:
  62. suggestion = result_to_text(ddg_result.get('FirstURL', None),
  63. ddg_result.get('Text', None),
  64. ddg_result.get('Result', None))
  65. if suggestion != heading:
  66. results.append({'suggestion': suggestion})
  67. elif 'Topics' in ddg_result:
  68. suggestions = []
  69. relatedTopics.append({'name': ddg_result.get('Name', ''),
  70. 'suggestions': suggestions})
  71. for topic_result in ddg_result.get('Topics', []):
  72. suggestion = result_to_text(topic_result.get('FirstURL', None),
  73. topic_result.get('Text', None),
  74. topic_result.get('Result', None))
  75. if suggestion != heading:
  76. suggestions.append(suggestion)
  77. # abstract
  78. abstractURL = search_res.get('AbstractURL', '')
  79. if abstractURL != '':
  80. # add as result ? problem always in english
  81. infobox_id = abstractURL
  82. urls.append({'title': search_res.get('AbstractSource'),
  83. 'url': abstractURL})
  84. # definition
  85. definitionURL = search_res.get('DefinitionURL', '')
  86. if definitionURL != '':
  87. # add as result ? as answer ? problem always in english
  88. infobox_id = definitionURL
  89. urls.append({'title': search_res.get('DefinitionSource'),
  90. 'url': definitionURL})
  91. # to merge with wikidata's infobox
  92. if infobox_id:
  93. infobox_id = http_regex.sub('https:', infobox_id)
  94. # entity
  95. entity = search_res.get('Entity', None)
  96. # TODO continent / country / department / location / waterfall /
  97. # mountain range :
  98. # link to map search, get weather, near by locations
  99. # TODO musician : link to music search
  100. # TODO concert tour : ??
  101. # TODO film / actor / television / media franchise :
  102. # links to IMDB / rottentomatoes (or scrap result)
  103. # TODO music : link tu musicbrainz / last.fm
  104. # TODO book : ??
  105. # TODO artist / playwright : ??
  106. # TODO compagny : ??
  107. # TODO software / os : ??
  108. # TODO software engineer : ??
  109. # TODO prepared food : ??
  110. # TODO website : ??
  111. # TODO performing art : ??
  112. # TODO prepared food : ??
  113. # TODO programming language : ??
  114. # TODO file format : ??
  115. if len(heading) > 0:
  116. # TODO get infobox.meta.value where .label='article_title'
  117. if image is None and len(attributes) == 0 and len(urls) == 1 and\
  118. len(relatedTopics) == 0 and len(content) == 0:
  119. results.append({
  120. 'url': urls[0]['url'],
  121. 'title': heading,
  122. 'content': content
  123. })
  124. else:
  125. results.append({
  126. 'infobox': heading,
  127. 'id': infobox_id,
  128. 'entity': entity,
  129. 'content': content,
  130. 'img_src': image,
  131. 'attributes': attributes,
  132. 'urls': urls,
  133. 'relatedTopics': relatedTopics
  134. })
  135. return results