Diffstat (limited to 'archive/bin/extract_url.pl')
-rwxr-xr-x  archive/bin/extract_url.pl  967
1 file changed, 967 insertions, 0 deletions
diff --git a/archive/bin/extract_url.pl b/archive/bin/extract_url.pl
new file mode 100755
index 00000000..3d32a081
--- /dev/null
+++ b/archive/bin/extract_url.pl
@@ -0,0 +1,967 @@
+#!/usr/bin/env perl
+
+# License: BSD-2-Clause (simplified)
+# URL: http://spdx.org/licenses/BSD-2-Clause
+#
+# Copyright (C) 2011-2013 Kyle Wheeler
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY KYLE WHEELER "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL KYLE WHEELER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+use MIME::Parser;
+use HTML::Parser;
+use Getopt::Std;
+use Pod::Usage;
+use Env;
+use strict;
+use warnings;
+
+my $LICENSE = "BSD-2-Clause";
+my $NAME = "extract_url";
+my $version = "1.5.8";
+my $txtonly = 0;
+my $list = '';
+my $help = '';
+
+sub HELP_MESSAGE {
+ pod2usage(0);
+}
+sub VERSION_MESSAGE {
+ print "$NAME $version License:$LICENSE\n";
+}
+
+my %options;
+eval "use Getopt::Long";
+if ($@) {
+ $Getopt::Std::STANDARD_HELP_VERSION = 1;
+ &getopts("hltV",\%options) or pod2usage(-exitval=>2,-verbose=>1);
+} else {
+ &GetOptions('Version' => sub { VERSION_MESSAGE(); exit; },
+ 'help' => sub { pod2usage(-exitval=>0,-verbose=>1); },
+ 'man' => sub { pod2usage(-exitval=>0, -verbose=>99); },
+ 'text' => \$txtonly,
+ 'list!' => \$list) or pod2usage(-exitval=>2,-verbose=>1);
+}
+my $fancymenu = 1;
+if ($options{'l'} || $list) { $fancymenu = 0; }
+if ($options{'t'}) { $txtonly = 1; } # map Getopt::Std's -t onto --text
+if ($options{'V'}) { &VERSION_MESSAGE(); exit; }
+if ($options{'h'}) { &HELP_MESSAGE(); }
+
+# create a hash of html tag names that may have links
+my %link_attr = (
+ 'a' => {'href'=>1},
+ 'applet' => {'archive'=>1,'codebase'=>1,'code'=>1},
+ 'area' => {'href'=>1},
+ 'blockquote' => {'cite'=>1},
+ #'body' => {'background'=>1},
+ 'embed' => {'pluginspage'=>1, 'src'=>1},
+ 'form' => {'action'=>1},
+ 'frame' => {'src'=>1, 'longdesc'=>1},
+ 'iframe' => {'src'=>1, 'longdesc'=>1},
+ #'ilayer' => {'background'=>1},
+ #'img' => {'src'=>1},
+ 'input' => {'src'=>1, 'usemap'=>1},
+ 'ins' => {'cite'=>1},
+ 'isindex' => {'action'=>1},
+ 'head' => {'profile'=>1},
+ #'layer' => {'background'=>1, 'src'=>1},
+ 'layer' => {'src'=>1},
+ 'link' => {'href'=>1},
+ 'object' => {'classid'=>1, 'codebase'=>1, 'data'=>1, 'archive'=>1,
+ 'usemap'=>1},
+ 'q' => {'cite'=>1},
+ 'script' => {'src'=>1, 'for'=>1},
+ #'table' => {'background'=>1},
+ #'td' => {'background'=>1},
+ #'th' => {'background'=>1},
+ #'tr' => {'background'=>1},
+ 'xmp' => {'href'=>1},
+);
+
+# find out the URLVIEW command
+my $urlviewcommand="";
+my $displaysanitized = 0; # means to display the sanitized URL instead of the pretty one
+my $shortcut = 0; # means open the URL without prompting if there's only 1 URL
+my $noreview = 0; # means don't display overly-long URLs to be checked before opening
+my $persist = 0; # means don't exit after viewing a URL (ignored if $shortcut == 0)
+my $ignore_empty = 0; # means to throw out URLs that don't have text in HTML
+my $default_view = "url"; # means what shows up in the list by default: urls or contexts
+my $alt_select_key = 'k';
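+# For illustration, a hypothetical ~/.extract_urlview exercising the
+# directives that getprefs() below understands (see the POD for details):
+#
+#   COMMAND firefox %s
+#   SHORTCUT
+#   NOREVIEW
+#   ALTSELECT j
+#   HTML_TAGS a,link,iframe
+#   DEFAULT_VIEW context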
+sub getprefs
+{
+ if (open(PREFFILE,'<',$ENV{'HOME'}."/.extract_urlview")) {
+ while (<PREFFILE>) {
+ my $lineread = $_;
+ if ($lineread =~ /^ALTSELECT ([A-Za-fh-z0-9,.<>?;:{}|!@#\$%^&*()_=+`~-])$/) {
+ $alt_select_key = $1; # \$ escaped and '-' moved last so the class isn't mangled
+ } elsif ($lineread =~ /^SHORTCUT$/) { $shortcut = 1;
+ } elsif ($lineread =~ /^NOREVIEW$/) { $noreview = 1;
+ } elsif ($lineread =~ /^PERSISTENT$/) { $persist = 1;
+ } elsif ($lineread =~ /^DISPLAY_SANITIZED$/) { $displaysanitized = 1;
+ } elsif ($lineread =~ /^IGNORE_EMPTY_TAGS$/) { $ignore_empty = 1;
+ } elsif ($lineread =~ /^COMMAND (.*)/) {
+ $urlviewcommand=$1;
+ chomp $urlviewcommand;
+ } elsif ($lineread =~ /^DEFAULT_VIEW (.*)/) {
+ if ($1 =~ /^context$/) {
+ $default_view = "context";
+ } else {
+ $default_view = "url";
+ }
+ } elsif ($lineread =~ /^HTML_TAGS (.*)/) {
+ my @tags = split(',', $1);
+ my %tags_hash;
+ foreach my $tag (@tags) {
+ $tags_hash{lc $tag} = 1;
+ }
+ foreach my $tag (keys %link_attr) {
+ delete $link_attr{$tag} if (! exists($tags_hash{$tag}));
+ }
+ }
+ }
+ close PREFFILE;
+ } elsif (open(URLVIEW,'<',$ENV{'HOME'}."/.urlview")) {
+ while (<URLVIEW>) {
+ if (/^COMMAND (.*)/) {
+ $urlviewcommand=$1;
+ chomp $urlviewcommand;
+ last;
+ }
+ }
+ close URLVIEW;
+ }
+ if ($urlviewcommand eq "") {
+ if (exists $ENV{BROWSER}) {
+ $urlviewcommand=$ENV{BROWSER};
+ } else {
+ $urlviewcommand = "open";
+ }
+ }
+}
+
+my %link_hash;
+my %orig_text;
+my $newlink = 1;
+sub foundurl {
+ my ($uri) = @_;
+ #$uri =~ s/mailto:(.*)/$1/;
+ if (! $link_hash{$uri}) {
+ $link_hash{$uri} = $newlink++;
+ }
+}
+my $foundurl_text_curindex = 0;
+my $foundurl_text_lastindex = 0;
+my $foundurl_text_prevurl = "";
+my $foundurl_text_text;
+
+sub foundurl_text {
+ my ($uri,$orig) = @_;
+ $uri = &renderuri($uri);
+ $foundurl_text_curindex = index($$foundurl_text_text, $orig, $foundurl_text_lastindex);
+ my $sincelast;
+ if ($foundurl_text_curindex >= 0) {
+ # this is the expected behavior
+ $sincelast = &tidytext(substr($$foundurl_text_text,$foundurl_text_lastindex,($foundurl_text_curindex-$foundurl_text_lastindex)));
+ } else {
+ # something odd is going on. What's happened is that our URL finder has
+ # found a URL that isn't in the text following the last URL it found.
+ # It *may* be doing things out of order... but that's really strange.
+ # We rely on it finding URLs in order of appearance in order to get
+ # context information. I'll try to recover but whatever happens, we
+ # can't get context information for this URL, and our context info for
+ # other URLs may be seriously messed up!
+ $foundurl_text_curindex = index($$foundurl_text_text, $orig);
+ if ($foundurl_text_curindex >= 0) {
+ # okay, we can recover... we'll just pretend that *everything* is
+ # the sincelast text
+ $sincelast = &tidytext(substr($$foundurl_text_text, 0, $foundurl_text_curindex));
+ } else {
+ # Very strange... I can't even find the URL! The best we can do is
+ # continue without *any* context... but there's *SERIOUS* weirdness
+ # going on, and expectations have been *majorly* violated. Let's
+ # just hope the URL is already closed (and so already has context
+ # information). I'm setting the curindex so that it'll be zero for
+ # the next URL (i.e. we can pretend that everything up to the next
+ # url is "sincelast")
+ $foundurl_text_curindex = 0 - length($orig);
+ $sincelast = "";
+ }
+ }
+ $sincelast =~ s/<$//;
+ $sincelast =~ s/^>//;
+ &foundurl($uri);
+ &process_sincelast($uri, $foundurl_text_prevurl, $sincelast);
+ $foundurl_text_lastindex = $foundurl_text_curindex + length($orig);
+ $foundurl_text_prevurl = $uri;
+}
+sub unfindurl {
+ my($uri) = @_;
+ delete($link_hash{$uri});
+ delete($orig_text{$uri});
+}
+sub renderuri {
+ my($uri) = @_;
+ $uri =~ s/&amp;/&/gs;
+ $uri =~ s/%([0-7][a-fA-F0-9])/chr(hex($1))/egs;
+ return $uri;
+}
+sub sanitizeuri {
+ my($uri) = @_;
+ $uri =~ s/([^a-zA-Z0-9_.!*()\@:=\?\/%~+-])/sprintf("%%%X",ord($1))/egs;
+ return $uri;
+}
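+# For example (a sketch of the two helpers above):
+#   renderuri('http://x/a%20b&amp;c')  yields  'http://x/a b&c'
+#   sanitizeuri("http://x/it's $1")    yields  'http://x/it%27s%20%241'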
+
+my $parser = MIME::Parser->new;
+
+my %closedurls;
+
+sub process_sincelast
+{
+ my($url,$prev,$sincelast) = @_;
+ if (length($prev) > 0 && ! exists($closedurls{$prev})) {
+ $orig_text{$prev} .= " ".substr($sincelast,0,30);
+ $closedurls{$prev} = 1;
+ #print "URL(".$link_hash{$prev}.":".$newlink."): $prev ->\n\t".$orig_text{$prev}."\n\n";
+ }
+ if (! exists($closedurls{$url})) {
+ my $beforetext = substr $sincelast, -30;
+ if (length($beforetext)) {
+ $orig_text{$url} = "$beforetext =>URL<=";
+ } else {
+ $orig_text{$url} = "=>URL<=";
+ }
+ }
+}
+
+sub extract_url_from_text {
+ ($foundurl_text_text) = @_;
+ # The idea here is to eliminate duplicate URLs - I want the
+ # %link_hash to be full of URLs. My regex (in the else statement)
+ # is decent, but imperfect. URI::Find is better.
+ my $fancyfind=1;
+ eval "use URI::Find::Schemeless";
+ $fancyfind=0 if ($@);
+ if ($fancyfind == 1) {
+ my $finder = URI::Find::Schemeless->new(\&foundurl_text);
+ $finder->find($foundurl_text_text);
+ } else {
+ $$foundurl_text_text =~ s{(((mms|ftp|http|https)://|news:)[][A-Za-z0-9_.~!*'();:@&=+,/?%#-]+[^](,.'">;[:space:]]|(mailto:)?[-a-zA-Z_0-9.+]+@[-a-zA-Z_0-9.]+)}{
+ &foundurl_text($1,$1);
+ }eg;
+ }
+}
+
+my $seenstart = 0;
+my $seenurl = "";
+my $beforetext = "";
+my $extendedskipped = "";
+my $last10words = "";
+my $words_since_link_end = "";
+
+sub tidytext
+{
+ my ($text) = @_;
+ my %rendermap = (
+ '[\n]' => ' ',
+ '[\r]' => ' ',
+ '&#[0-9]+;' => '',
+ '&#x[0-9a-f]+;' => '',
+ '&nbsp;' => ' ',
+ '&copy;' => '(c)',
+ '&mdash;' => '---',
+ '&quot;' => '"',
+ '&apos;' => "'",
+ '&lt;' => '<',
+ '&gt;' => '>',
+ '&([ACEINOUY])(grave|acute|circ|tilde|uml|ring|cedil);' => '$1',
+ '&amp;' => '&',
+ '\s\s+' => ' ',
+ );
+ foreach my $entity (keys %rendermap) {
+ my $construct = '$text =~ s/$entity/'.$rendermap{$entity}.'/ig';
+ eval $construct;
+ }
+ $text =~ s/^\s+//;
+ $text =~ s/\s+$//;
+ return $text;
+}
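+# For example (a sketch):
+#   tidytext("  Tom &amp; Jerry&nbsp;&mdash; &quot;toons&quot;\n")
+# yields: Tom & Jerry --- "toons"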
+
+sub subwords
+{
+ my ($string, $minlen) = @_;
+ my @words = split(/\s+/, $string);
+ return "" if @words == 0;
+ my $retstr = $words[0];
+ my $wordcount = 1;
+ while (length($retstr) < $minlen && $wordcount < @words) {
+ $retstr .= " " . $words[$wordcount];
+ $wordcount++;
+ }
+ return $retstr;
+}
+
+sub sublastwords
+{
+ my ($string, $minlen) = @_;
+ my @words = split(/\s+/, $string);
+ return "" if @words == 0;
+ my $retstr = $words[@words-1];
+ my $wordcount = 1;
+ while (length($retstr) < $minlen && $wordcount < @words) {
+ $wordcount++;
+ $retstr = $words[@words - $wordcount] . " $retstr";
+ }
+ return $retstr;
+}
+
+sub find_urls_rec
+{
+ my($ent) = @_;
+ #print "type: " . $ent->mime_type . " ... parts: ".$ent->parts."\n";
+ if ($ent->parts >= 1 or $ent->mime_type eq "multipart/mixed") {
+ for (my $i=0;$i<$ent->parts;$i++) {
+ find_urls_rec($ent->parts($i));
+ }
+ } else {
+ #print "type: " . $ent->mime_type . "\n";
+ if ($ent->mime_type eq "message/rfc822") { &find_urls_rec($ent->parts()); }
+ elsif ($ent->mime_type eq "text/html" ) {
+ my $parser = HTML::Parser->new(api_version=>3);
+ my $skipped_text = "";
+ #$parser->unbroken_text(1);
+ $parser->handler(start => sub {
+ my($tagname,$pos,$text) = @_;
+ if (my $link_attr = $link_attr{$tagname}) {
+ while (4 <= @$pos) {
+ my($k_offset, $k_len, $v_offset, $v_len) = splice(@$pos,-4);
+ my $attrname = lc(substr($text, $k_offset, $k_len));
+ next unless exists($link_attr->{$attrname});
+ next unless $v_offset; # 0 v_offset means no value
+
+ # This is REALLY hack-ish and fragile, but can
+ # sometimes be invaluable
+ &extract_url_from_text(\$skipped_text) if (length($skipped_text) > 0);
+
+ my $v = substr($text, $v_offset, $v_len);
+ $v =~ s/^([\'\"])(.*)\1$/$2/;
+ $v = &renderuri($v);
+ &foundurl($v);
+
+ $words_since_link_end .= " $skipped_text";
+ $last10words = &tidytext("$last10words $skipped_text");
+ $last10words = &sublastwords($last10words, 50);
+ $skipped_text = "";
+
+ $words_since_link_end = &tidytext($words_since_link_end);
+ if (length($seenurl) > 0 && ! exists($closedurls{$seenurl})) {
+ my $since_words = &subwords($words_since_link_end, 40);
+ if (length($since_words) > 0) {
+ my $space = " ";
+ $space = "" if ($since_words =~ /^[.,;!?)-]/);
+ $orig_text{$seenurl} .= "$space$since_words";
+ }
+ $closedurls{$seenurl} = 1;
+ }
+
+ $beforetext = &sublastwords($last10words, 30);
+ $seenstart = 1;
+ $seenurl = $v;
+ }
+ }
+ },
+ "tagname, tokenpos, text");
+ $parser->handler(end => sub {
+ my ($text) = @_;
+ $last10words = &tidytext("$last10words $skipped_text");
+ $last10words = &sublastwords($last10words, 50);
+ if ($seenstart == 1) {
+ if (! exists($closedurls{$seenurl})) {
+ my $mtext = "=>$skipped_text<=";
+ if (length($beforetext)) {
+ my $space = " ";
+ $space = "" if ($beforetext =~ /[(-]$/);
+ $orig_text{$seenurl} = "$beforetext$space$mtext";
+ } else {
+ $orig_text{$seenurl} = "$mtext";
+ }
+ }
+ if (length($skipped_text) == 0 && $ignore_empty == 1 && ! exists($closedurls{$seenurl})) {
+ &unfindurl($seenurl);
+ }
+ $seenstart = 0;
+ $extendedskipped .= " $skipped_text";
+ $words_since_link_end = "";
+ } else {
+ $words_since_link_end .= " $skipped_text";
+ }
+ $skipped_text = "";
+ },"text");
+ # The "text" handler is used rather than the skipped_text argspec
+ # because otherwise blocks of text at the beginning of a "lightly
+ # html-ified" document can be ignored.
+ $parser->handler(text => sub {
+ my ($text) = @_;
+ $skipped_text = &tidytext("$skipped_text $text");
+ }, "text");
+ $parser->parse($ent->bodyhandle->as_string);
+ $parser->eof;
+ if (length($words_since_link_end) > 0) {
+ # This is REALLY hack-ish and fragile, but can
+ # sometimes be invaluable
+ &extract_url_from_text(\$words_since_link_end);
+ }
+ if (length($skipped_text) > 0) {
+ &extract_url_from_text(\$skipped_text);
+ }
+ } elsif ($ent->mime_type =~ /text\/.*/) {
+ $ent->head->unfold;
+ my $ctype = $ent->head->get('Content-type');
+ if (defined($ctype) and $ctype =~ m/format=flowed/) {
+ my @lines = $ent->bodyhandle->as_lines;
+ chomp(@lines);
+ my $body = "";
+ my $delsp;
+ if ($ctype =~ /delsp=yes/) {
+ #print "delsp=yes!\n";
+ $delsp=1;
+ } else {
+ #print "delsp=no!\n";
+ $delsp=0;
+ }
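+ # For illustration: two flowed lines such as "> a long " and
+ # "> quoted line" get rejoined below into "> a long quoted line";
+ # a soft break is a trailing space, which delsp=yes says to delete.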
+ for (my $i=0;$i<@lines;$i++) {
+ my $col = 0;
+ my $quotetext = "";
+ #print "=> " . $lines[$i] . "\n";
+ while (substr($lines[$i],$col,1) eq ">") {
+ $quotetext .= ">";
+ $col++;
+ }
+ if ($col > 0) { $body .= "$quotetext "; }
+ while ($lines[$i] =~ / $/ && $lines[$i] =~ /^$quotetext[^>]/ && defined($lines[$i+1]) && $lines[$i+1] =~ /^$quotetext[^>]/) {
+ my $line;
+ if ($delsp) {
+ $line = substr($lines[$i],$col,length($lines[$i])-$col-1);
+ } else {
+ $line = substr($lines[$i],$col);
+ }
+ $line =~ s/^\s+//;
+ $body .= $line;
+ $i++;
+ }
+ if ($lines[$i] =~ /^$quotetext[^>]/) {
+ my $line = substr($lines[$i],$col);
+ $line =~ s/^\s+//;
+ $body .= $line."\n";
+ }
+ }
+ &extract_url_from_text(\$body);
+ } else {
+ &extract_url_from_text(\$ent->bodyhandle->as_string);
+ }
+ }
+ }
+}
+
+sub urlwrap {
+ my($subseq,$text,$linelen,$breaker) = @_;
+ my $len = length($text);
+ my $i = 0;
+ my $output = "";
+ while ($len > $linelen) {
+ if ($i > 0) { $output .= $subseq; }
+ my $breakpoint = -1;
+ my $chunk = substr($text,$i,$linelen);
+ my @chars = ("!","*","'","(",")",";",":","@","&","=","+",",","/","?","%","#","[","]","-","_");
+ foreach my $chr ( @chars ) {
+ my $pt = rindex($chunk,$chr);
+ if ($breakpoint < $pt) { $breakpoint = $pt; }
+ }
+ if ($breakpoint == -1) { $breakpoint = $linelen; }
+ else { $breakpoint += 1; }
+ $output .= substr($text,$i,$breakpoint) . $breaker;
+ if ($i == 0) { $linelen -= length($subseq); }
+ $len -= $breakpoint;
+ $i += $breakpoint;
+ }
+ if ($i > 0) { $output .= $subseq; }
+ $output .= substr($text,$i);
+ return $output;
+}
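+# For example (a sketch), urlwrap(" ", "http://example.com/abc", 10, "\n")
+# breaks after punctuation and yields the three lines:
+#   http://
+#    example.
+#    com/abc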
+
+sub isOutputScreen {
+ use POSIX;
+ return 0 if POSIX::isatty( \*STDOUT) eq "" ; # pipe
+ return 1; # screen
+} # end of isOutputScreen
+
+&getprefs();
+$parser->output_to_core(1);
+my $filecontents;
+if ($#ARGV == 0) {
+ open(INPUT, '<', $ARGV[0]) or die "Couldn't open input file $ARGV[0]: $!";
+ $filecontents = join('',<INPUT>);
+ close(INPUT);
+} else {
+ die "no input provided!\n" if POSIX::isatty( \*STDIN) ne "" ; # pipe
+ $filecontents = join('',<STDIN>);
+}
+
+if (not $txtonly) {
+ my $entity = $parser->parse_data($filecontents);
+ &find_urls_rec($entity);
+ if (scalar(keys %link_hash) == 0) {
+ &extract_url_from_text(\$filecontents);
+ }
+} else {
+ # $filecontents already holds the entire input (file or STDIN), so
+ # use it directly; the INPUT handle was closed above and, in the
+ # STDIN case, was never opened at all.
+ &extract_url_from_text(\$filecontents);
+}
+
+if (&isOutputScreen) {
+ eval "use Curses::UI";
+ $fancymenu = 0 if ($@);
+} else {
+ $fancymenu = 0;
+}
+
+if ($fancymenu == 1) {
+ #use strict;
+
+ # This is the shortcut...
+ if ($shortcut == 1 && 1 == scalar keys %link_hash) {
+ my ($url) = each %link_hash;
+ $url = &sanitizeuri($url);
+ if ($urlviewcommand =~ m/%s/) {
+ $urlviewcommand =~ s/%s/'$url'/g;
+ } else {
+ $urlviewcommand .= " $url";
+ }
+ system $urlviewcommand;
+ exit 0;
+ }
+
+ # Curses support really REALLY wants to own STDIN
+ close(STDIN);
+ open(STDIN, '<', "/dev/tty"); # looks like a hack, smells like a hack...
+
+ my $cui = Curses::UI->new(
+ -color_support => 1,
+ -clear_on_exit => 1
+ );
+ my $wrapwidth = $cui->width() - 2;
+ my %listhash_url;
+ my %listhash_context;
+ my @listvals;
+ # $link_hash{url} = ordering of the urls in the document as first-seen
+ foreach my $url (sort {$link_hash{$a} <=> $link_hash{$b} } keys(%link_hash)) {
+ push(@listvals,$link_hash{$url});
+ if ($displaysanitized) {
+ $listhash_url{$link_hash{$url}} = &sanitizeuri($url);
+ } else {
+ $listhash_url{$link_hash{$url}} = $url;
+ }
+ $listhash_context{$link_hash{$url}} = $orig_text{$url};
+ }
+
+ my @menu = (
+ { -label => 'Keys: q=quit m=menu s=switch-view c=context g=top G=bottom',
+ -submenu => [
+ { -label => 'About a', -value => \&about },
+ { -label => 'Show Command C', -value => \&show_command },
+ { -label => 'Switch List View s', -value => \&switch_list },
+ { -label => 'Exit ^q', -value => \&exit_dialog }
+ ],
+ },
+ );
+ my $menu = $cui->add(
+ 'menu','Menubar',
+ -menu => \@menu,
+ );
+ my $win1 = $cui->add(
+ 'win1', 'Window',
+ -border => 1,
+ -y => 1,
+ -bfg => 'green',
+ );
+ sub about()
+ {
+ $cui->dialog(
+ -message => "$NAME $version License:$LICENSE"
+ );
+ }
+ sub show_command()
+ {
+ # This extra sprintf work is to ensure that the title
+ # is fully displayed even if $urlviewcommand is short
+ my $title = "The configured URL viewing command is:";
+ my $len = length($title);
+ my $cmd = sprintf("%-${len}s",$urlviewcommand);
+ $cui->dialog(
+ -title => $title,
+ -message => $cmd,
+ );
+ }
+ sub exit_dialog()
+ {
+ my $return = $cui->dialog(
+ -message => "Do you really want to quit?",
+ -buttons => ['yes', 'no'],
+ );
+ exit(0) if $return;
+ }
+
+ my $listbox_labels;
+ if ($default_view eq "url") {
+ $listbox_labels = \%listhash_url;
+ } else {
+ $listbox_labels = \%listhash_context;
+ }
+ my $listbox = $win1->add(
+ 'mylistbox', 'Listbox',
+ -values => \@listvals,
+ -labels => $listbox_labels,
+ );
+ $cui->set_binding(sub {$menu->focus()}, "\cX");
+ $cui->set_binding(sub {$menu->focus()}, "m");
+ $cui->set_binding( sub{exit}, "q" );
+ $cui->set_binding( \&exit_dialog , "\cQ");
+ $cui->set_binding( sub{exit} , "\cc");
+ $cui->set_binding(\&switch_list, "s");
+ $cui->set_binding(\&about, "a");
+ $cui->set_binding(\&show_command, "C");
+ $listbox->set_binding( 'option-last', "G");
+ $listbox->set_binding( 'option-first', "g");
+ sub switch_list()
+ {
+ if ($listbox_labels == \%listhash_url) {
+ $listbox->labels(\%listhash_context);
+ $listbox_labels = \%listhash_context;
+ } elsif ($listbox_labels == \%listhash_context) {
+ $listbox->labels(\%listhash_url);
+ $listbox_labels = \%listhash_url;
+ }
+ $listbox->focus();
+ }
+ sub madeselection_sub {
+ my ($stayopen) = @_;
+ my $rawurl = $listhash_url{$listbox->get_active_value()};
+ my $url = &sanitizeuri($rawurl);
+ my $command = $urlviewcommand;
+ if ($command =~ m/%s/) {
+ $command =~ s/%s/'$url'/g;
+ } else {
+ $command .= " $url";
+ }
+ my $return = 1;
+ if ($noreview != 1 && length($rawurl) > ($cui->width()-2)) {
+ $return = $cui->dialog(
+ -message => &urlwrap(" ",$rawurl,$cui->width()-8,"\n"),
+ -title => "Your Choice:",
+ -buttons => ['ok', 'cancel'],
+ );
+ }
+ if ($return) {
+ system $command;
+ if ($stayopen == 0) {
+ exit 0 if ($persist == 0);
+ } else {
+ exit 0 unless ($persist == 0);
+ }
+ }
+ }
+ sub madeselection { &madeselection_sub(0); }
+ sub altexit_madeselection { &madeselection_sub(1); }
+ $cui->set_binding( \&madeselection, " ");
+ $listbox->set_routine('option-select',\&madeselection);
+ $cui->set_binding( \&altexit_madeselection, $alt_select_key);
+ use Text::Wrap;
+ sub contextual {
+ my $rawurl = $listhash_url{$listbox->get_active_value()};
+ $Text::Wrap::columns = $cui->width()-8;
+ if (exists($orig_text{$rawurl}) && length($orig_text{$rawurl}) > 1) {
+ $cui->dialog(
+ -message => wrap('','',$orig_text{$rawurl}),
+ -title => "Context:",
+ -buttons => ['ok'],
+ );
+ } else {
+ $cui->error(
+ -message => "Sorry, I don't have any context for this link",
+ -buttons => ['ok'],
+ -bfg => 'red',
+ -tfg => 'red',
+ -fg => 'red',
+ );
+ }
+ }
+ $cui->set_binding( \&contextual, "c");
+
+ $listbox->focus();
+ $cui->mainloop();
+} else {
+ # using this as a pass-thru to URLVIEW
+ foreach my $value (sort {$link_hash{$a} <=> $link_hash{$b} } keys %link_hash)
+ {
+ $value = &sanitizeuri($value);
+ print "$value\n";
+ }
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+extract_url -- extract URLs from email messages
+
+=head1 SYNOPSIS
+
+extract_url [options] I<file>
+
+=head1 DESCRIPTION
+
+This is a Perl script that extracts URLs from correctly-encoded
+I<MIME> email messages. This can be used either as a pre-parser for
+I<urlview>, or to replace I<urlview> entirely.
+
+I<Urlview> is a great program, but has some deficiencies. In particular,
+it isn't particularly configurable, and cannot handle URLs that have
+been broken over several lines in I<format=flowed delsp=yes> email
+messages. Nor can it handle I<quoted-printable> email messages. Also,
+I<urlview> doesn't eliminate duplicate URLs. This Perl script handles
+all of that. It also sanitizes URLs so that they can't break out of the
+command shell.
+
+This is designed primarily for use with the I<mutt> emailer. The idea is
+that if you want to access a URL in an email, you pipe the email to a
+URL extractor (like this one) which then lets you select a URL to view
+in some third program (such as Firefox). An alternative design is to
+access URLs from within mutt's pager by defining macros and tagging the
+URLs in the display to indicate which macro to use. A script you can use
+to do that is I<tagurl.pl>.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-h, --help>
+
+Display this help and exit.
+
+=item B<-m, --man>
+
+Display the full man page documentation.
+
+=item B<-l, --list>
+
+Prevent use of Ncurses, and simply output a list of extracted URLs.
+
+=item B<-t, --text>
+
+Prevent MIME handling; treat the input as plain text.
+
+=item B<-V, --version>
+
+Output version information and exit.
+
+=back
+
+=head1 DEPENDENCIES
+
+Mandatory dependencies are B<MIME::Parser> and B<HTML::Parser>. Both are
+available from CPAN and are packaged for most operating systems.
+
+Optional dependencies are B<URI::Find> (recognizes more exotic URL
+variations in plain text), B<Curses::UI> (allows B<extract_url.pl> to
+fully replace I<urlview>), and B<Getopt::Long> (enables long options
+such as B<--version> and B<--list>).
+
+=head1 EXAMPLES
+
+This Perl script expects a valid email to be either piped in via STDIN or in a
+file listed as the script's only argument. Its STDOUT can be a pipe into
+I<urlview> (it will detect this). Here's how you can use it:
+
+ cat message.txt | extract_url.pl
+ cat message.txt | extract_url.pl | urlview
+ extract_url.pl message.txt
+ extract_url.pl message.txt | urlview
+
+For use with B<mutt 1.4.x>, here's a macro you can use:
+
+ macro index,pager \cb "\
+ <enter-command> \
+ unset pipe_decode<enter>\
+ <pipe-message>extract_url.pl<enter>" \
+ "get URLs"
+
+For use with B<mutt 1.5.x>, here's a more complicated macro you can use:
+
+ macro index,pager \cb "\
+ <enter-command> set my_pdsave=\$pipe_decode<enter>\
+ <enter-command> unset pipe_decode<enter>\
+ <pipe-message>extract_url.pl<enter>\
+ <enter-command> set pipe_decode=\$my_pdsave<enter>" \
+ "get URLs"
+
+Here's a suggestion for how to handle I<encrypted email>:
+
+ macro index,pager ,b "\
+ <enter-command> set my_pdsave=\$pipe_decode<enter>\
+ <enter-command> unset pipe_decode<enter>\
+ <pipe-message>extract_url.pl<enter>\
+ <enter-command> set pipe_decode=\$my_pdsave<enter>" \
+ "get URLs"
+
+ macro index,pager ,B "\
+ <enter-command> set my_pdsave=\$pipe_decode<enter>\
+ <enter-command> set pipe_decode<enter>\
+ <pipe-message>extract_url.pl<enter>\
+ <enter-command> set pipe_decode=\$my_pdsave<enter>" \
+ "decrypt message, then get URLs"
+
+ message-hook . 'macro index,pager \cb ,b "URL viewer"'
+ message-hook ~G 'macro index,pager \cb ,B "URL viewer"'
+
+=head1 CONFIGURATION
+
+If you're using it with B<Curses::UI> (i.e. as a standalone URL
+selector), this Perl script will try to figure out what command to use
+based on the contents of your F<~/.urlview> file. However, it also has
+its own configuration file (F<~/.extract_urlview>) that will be used
+instead, if it exists. So far, there are nine kinds of lines you can
+have in this file:
+
+=over 8
+
+=item COMMAND ...
+
+This line specifies the command that will be used to view URLs. This
+command CAN contain a I<%s>, which will be replaced by the URL inside
+single-quotes. If it does not contain a I<%s>, the URL will simply be
+appended to the command. If this line is not present, the command is
+taken from the B<BROWSER> environment variable. If B<BROWSER> is not
+set, the command is assumed to be "open", which is the correct command
+on Mac OS X systems.
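+
+For example, with the line from the sample configuration below,
+
+    COMMAND mozilla-firefox -remote "openURL(%s,new-window)"
+
+selecting the URL C<http://foo/> would run
+C<mozilla-firefox -remote "openURL('http://foo/',new-window)">.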
+
+=item SHORTCUT
+
+This line specifies that if an email contains only 1 URL, that URL will
+be opened without prompting. The default (without this line) is to
+always prompt.
+
+=item NOREVIEW
+
+Normally, if a URL is too long to display on screen in the menu, the
+user will be prompted with the full URL before opening it, just to make
+sure it's correct. This line turns that behavior off.
+
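+=item DISPLAY_SANITIZED
+
+By default, the menu displays each URL as it appeared in the email.
+This line makes the menu display the sanitized (percent-encoded) form
+instead, i.e. exactly what will be handed to the viewing command.
+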
+=item PERSISTENT
+
+By default, when a URL has been selected and viewed from the menu,
+B<extract_url.pl> will exit. If you would like it to be ready to view
+another URL without re-parsing the email (i.e. much like standard
+I<urlview> behavior), add this line to the config file.
+
+=item IGNORE_EMPTY_TAGS
+
+By default, the script collects all the URLs it can find. Sometimes,
+though, HTML messages contain links that don't correspond to any text
+(and aren't normally rendered or accessible). This tells the script to
+ignore these links.
+
+=item HTML_TAGS ...
+
+This line specifies which HTML tags will be examined for URLs. By
+default, the script is very generous, looking in I<a>, I<applet>,
+I<area>, I<blockquote>, I<embed>, I<form>, I<frame>, I<iframe>,
+I<input>, I<ins>, I<isindex>, I<head>, I<layer>, I<link>, I<object>,
+I<q>, I<script>, and I<xmp> tags for links. If you would like it to
+examine just a subset of these (e.g. you only want I<a> tags to be
+examined), merely list the subset you want. The list is expected to be a
+comma-separated list. If there are multiple of these lines in the config
+file, the script will look for the minimum set of specified tags.
+
+=item ALTSELECT ...
+
+This line specifies a key for an alternate URL viewing behavior. By
+default, B<extract_url.pl> will quit after the URL viewer has been
+launched for the selected URL; this key instead launches the URL viewer
+without quitting. However, if I<PERSISTENT> is
+specified in the config file, the opposite is true: normal selection of
+a URL will launch the URL viewer and will not cause B<extract_url.pl> to
+exit, but this key will. This setting defaults to I<k>.
+
+=item DEFAULT_VIEW {url|context}
+
+This line specifies whether to show the list of URLs or the list of
+URL contexts when the program starts. By default, B<extract_url.pl>
+shows a list of URLs.
+
+=back
+
+Here is an example config file:
+
+ SHORTCUT
+ COMMAND mozilla-firefox -remote "openURL(%s,new-window)"
+ HTML_TAGS a,iframe,link
+ ALTSELECT Q
+ DEFAULT_VIEW context
+
+=head1 STANDARDS
+
+None.
+
+=head1 AVAILABILITY
+
+http://www.memoryhole.net/~kyle/extract_url/
+
+=head1 SEE ALSO
+
+mutt(1)
+urlview(1)
+urlscan(1)
+
+=head1 CAVEATS
+
+Potentially dangerous shell characters in URLs (namely the single quote
+and the dollar sign) are percent-encoded before the URL is used in a
+shell. This should eliminate the possibility of a bad URL breaking out
+of the command shell.
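+
+For example, the URL C<http://example.com/it's> would be passed to the
+shell as C<http://example.com/it%27s>.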
+
+If using B<Curses::UI>, and a URL is too big for your terminal, when you
+select it, B<extract_url.pl> will (by default) ask you to review it in a
+form that lets you see the whole thing.
+
+=head1 AUTHOR
+
+This program was written by Kyle Wheeler <kyle@memoryhole.net>.
+
+Released under the BSD-2-Clause (simplified) license. For more
+information about the license, visit <http://spdx.org/licenses/BSD-2-Clause>.
+
+=cut