extract-links.pl (1368B)
- #!/usr/bin/env perl
- # Multimedia-DL: Youtube-DL inspired scraper
- # Copyright © 2021 Multimedia-DL Authors <https://hacktivis.me/git/multimedia-dl/>
- # SPDX-License-Identifier: AGPL-3-only
- use strict;
- use utf8;
- use HTML::TreeBuilder;
- use HTML::TreeBuilder::XPath;
- use LWP::UserAgent;
- use URI;
# HTTP client shared by the rest of the script.
my $ua = LWP::UserAgent->new;
# Browser-like WebKit identification string; our own product token is
# appended to it below.
my $webkit_ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15";
# Product tokens in a User-Agent value are whitespace-separated; without the
# leading space "Multimedia-DL/1.0" would fuse into the Safari version token.
$ua->agent($webkit_ua . " Multimedia-DL/1.0");
# Exactly one positional argument (the URL to fetch) is required.
if (@ARGV != 1) {
    # Usage goes to STDERR so that stdout only ever carries extracted links.
    print STDERR "usage: multimedia-dl <url>\n";
    exit 1;
}
# Issue a GET for the URL given on the command line; the resulting response
# object is examined by the code further down.
my $res = $ua->get($ARGV[0]);
# Print every link found in an HTML response, one absolute URL per line.
#
# Argument: $res, the response object of the fetched page.
# Output:   to stdout, in this order: the href of every <a>, the href of
#           every <link>, then every src attribute on any element, each
#           resolved against the response's base URL.
# Dies if the parser does not return a tree.
sub scrap_html_response {
    my ($res) = @_;

    # decoded_content undoes Content-Encoding and the charset so the parser
    # sees characters rather than raw bytes; it returns undef when decoding
    # fails, in which case the raw body is used as before.
    my $html = $res->decoded_content;
    $html = $res->content unless defined $html;

    my $tree = HTML::TreeBuilder::XPath->new_from_content($html)
        or die "HTML parsing failed";

    # The base URL is the same for every link, so look it up only once.
    my $base = $res->base;

    for my $xpath ('//a/@href', '//link/@href', '//@src') {
        for my $link ($tree->findvalues($xpath)) {
            print URI->new_abs($link, $base), "\n";
        }
    }

    # Release the parse tree explicitly instead of relying on garbage
    # collection of its parent/child references.
    $tree->delete;

    return;
}
# Act on the outcome of the request: scrape HTML/XHTML pages; report anything
# else on STDERR and exit non-zero so callers can tell failure from an empty
# link list.
if ($res->is_success) {
    # content_type gives the bare media type, lower-cased and without
    # parameters such as "; charset=utf-8", so it can be compared exactly.
    my $content_type = $res->content_type;
    # Strings must be compared with eq: with == both sides numify to 0 and
    # every response would be treated as HTML.
    if ($content_type eq "text/html" or $content_type eq "application/xhtml+xml") {
        scrap_html_response($res);
    } else {
        print STDERR "Doesn't seems to be HTML\n";
        exit 1;
    }
} else {
    print STDERR "Got ", $res->status_line, " instead of 2xx\n";
    exit 1;
}