logo

multimedia-dl

Unnamed repository; edit this file 'description' to name the repository.

extract-links.pl (1368B)


  1. #!/usr/bin/env perl
  2. # Multimedia-DL: Youtube-DL inspired scraper
  3. # Copyright © 2021 Multimedia-DL Authors <https://hacktivis.me/git/multimedia-dl/>
  4. # SPDX-License-Identifier: AGPL-3-only
  5. use strict;
  6. use utf8;
  7. use HTML::TreeBuilder;
  8. use HTML::TreeBuilder::XPath;
  9. use LWP::UserAgent;
  10. use URI;
  11. my $ua = LWP::UserAgent->new;
  12. my $webkit_ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15";
  13. $ua->agent($webkit_ua . "Multimedia-DL/1.0");
  14. if($#ARGV != 0) {
  15. print "usage: multimedia-dl <url>\n";
  16. exit 1;
  17. }
  18. my $req = HTTP::Request->new(GET => $ARGV[0]);
  19. my $res = $ua->request($req);
  20. sub scrap_html_response {
  21. my ($res) = @_;
  22. my $tree = HTML::TreeBuilder::XPath->new_from_content($res->content) or die "HTML parsing failed";
  23. foreach($tree->findvalues('//a/@href')) {
  24. print URI->new_abs($_, $res->base), "\n";
  25. }
  26. foreach($tree->findvalues('//link/@href')) {
  27. print URI->new_abs($_, $res->base), "\n";
  28. }
  29. foreach($tree->findvalues('//@src')) {
  30. print URI->new_abs($_, $res->base), "\n";
  31. }
  32. }
  33. if($res->is_success) {
  34. my $content_type = $res->header("Content-Type");
  35. if(($content_type == "text/html") or ($content_type == "application/xhtml+xml")) {
  36. scrap_html_response($res);
  37. } else {
  38. print "Doesn't seems to be HTML\n";
  39. }
  40. } else {
  41. print "Got ", $res->status_line, " instead of 2xx\n";
  42. }