logo

searx

My custom branch(es) of searx, a meta-search engine. git clone https://hacktivis.me/git/searx.git

filecrop.py (2601B)


  1. from searx.url_utils import urlencode
  2. try:
  3. from HTMLParser import HTMLParser
  4. except:
  5. from html.parser import HTMLParser
  6. url = 'http://www.filecrop.com/'
  7. search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}' # noqa
  8. paging = True
  9. class FilecropResultParser(HTMLParser):
  10. def __init__(self):
  11. HTMLParser.__init__(self)
  12. self.__start_processing = False
  13. self.results = []
  14. self.result = {}
  15. self.tr_counter = 0
  16. self.data_counter = 0
  17. def handle_starttag(self, tag, attrs):
  18. if tag == 'tr':
  19. if ('bgcolor', '#edeff5') in attrs or\
  20. ('bgcolor', '#ffffff') in attrs:
  21. self.__start_processing = True
  22. if not self.__start_processing:
  23. return
  24. if tag == 'label':
  25. self.result['title'] = [attr[1] for attr in attrs
  26. if attr[0] == 'title'][0]
  27. elif tag == 'a' and ('rel', 'nofollow') in attrs\
  28. and ('class', 'sourcelink') in attrs:
  29. if 'content' in self.result:
  30. self.result['content'] += [attr[1] for attr in attrs
  31. if attr[0] == 'title'][0]
  32. else:
  33. self.result['content'] = [attr[1] for attr in attrs
  34. if attr[0] == 'title'][0]
  35. self.result['content'] += ' '
  36. elif tag == 'a':
  37. self.result['url'] = url + [attr[1] for attr in attrs
  38. if attr[0] == 'href'][0]
  39. def handle_endtag(self, tag):
  40. if self.__start_processing is False:
  41. return
  42. if tag == 'tr':
  43. self.tr_counter += 1
  44. if self.tr_counter == 2:
  45. self.__start_processing = False
  46. self.tr_counter = 0
  47. self.data_counter = 0
  48. self.results.append(self.result)
  49. self.result = {}
  50. def handle_data(self, data):
  51. if not self.__start_processing:
  52. return
  53. if 'content' in self.result:
  54. self.result['content'] += data + ' '
  55. else:
  56. self.result['content'] = data + ' '
  57. self.data_counter += 1
  58. def request(query, params):
  59. index = 1 + (params['pageno'] - 1) * 30
  60. params['url'] = search_url.format(query=urlencode({'w': query}), index=index)
  61. return params
  62. def response(resp):
  63. parser = FilecropResultParser()
  64. parser.feed(resp.text)
  65. return parser.results