Diffstat (limited to 'archive/bin/extract_url.pl')
-rwxr-xr-x  archive/bin/extract_url.pl | 967
1 file changed, 967 insertions(+), 0 deletions(-)
diff --git a/archive/bin/extract_url.pl b/archive/bin/extract_url.pl new file mode 100755 index 00000000..3d32a081 --- /dev/null +++ b/archive/bin/extract_url.pl @@ -0,0 +1,967 @@ +#!/usr/bin/env perl + +# License: BSD-2-Clause (simplified) +# URL: http://spdx.org/licenses/BSD-2-Clause +# +# Copyright (C) 2011-2013 Kyle Wheeler +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY KYLE WHEELER "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL KYLE WHEELER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +use MIME::Parser; +use HTML::Parser; +use Getopt::Std; +use Pod::Usage; +use Env; +use strict; +use warnings; + +my $LICENSE = "BSD-2-Clause"; +my $NAME = "extract_url"; +my $version = "1.5.8"; +my $txtonly = 0; +my $list = ''; +my $help = ''; + +sub HELP_MESSAGE { + pod2usage(0); +} +sub VERSION_MESSAGE { + print "$NAME $version License:$LICENSE\n"; +} + +my %options; +eval "use Getopt::Long"; +if ($@) { + $Getopt::Std::STANDARD_HELP_VERSION = 1; + &getopts("hltV",\%options) or pod2usage(-exitval=>2,-verbose=>1); +} else { + &GetOptions('Version' => sub { VERSION_MESSAGE(); exit; }, + 'help' => sub { pod2usage(-exitval=>0,-verbose=>1); }, + 'man' => sub { pod2usage(-exitval=>0, -verbose=>99); }, + 'text' => \$txtonly, + 'list!' 
=> \$list) or pod2usage(-exitval=>2,-verbose=>1); +} +my $fancymenu = 1; +if ($options{'l'} || length $list) { $fancymenu = 0; } +if ($options{'V'}) { &VERSION_MESSAGE(); exit; } +if ($options{'h'}) { &HELP_MESSAGE(); } + +# create a hash of html tag names that may have links +my %link_attr = ( + 'a' => {'href'=>1}, + 'applet' => {'archive'=>1,'codebase'=>1,'code'=>1}, + 'area' => {'href'=>1}, + 'blockquote' => {'cite'=>1}, + #'body' => {'background'=>1}, + 'embed' => {'pluginspage'=>1, 'src'=>1}, + 'form' => {'action'=>1}, + 'frame' => {'src'=>1, 'longdesc'=>1}, + 'iframe' => {'src'=>1, 'longdesc'=>1}, + #'ilayer' => {'background'=>1}, + #'img' => {'src'=>1}, + 'input' => {'src'=>1, 'usemap'=>1}, + 'ins' => {'cite'=>1}, + 'isindex' => {'action'=>1}, + 'head' => {'profile'=>1}, + #'layer' => {'background'=>1, 'src'=>1}, + 'layer' => {'src'=>1}, + 'link' => {'href'=>1}, + 'object' => {'classid'=>1, 'codebase'=>1, 'data'=>1, 'archive'=>1, + 'usemap'=>1}, + 'q' => {'cite'=>1}, + 'script' => {'src'=>1, 'for'=>1}, + #'table' => {'background'=>1}, + #'td' => {'background'=>1}, + #'th' => {'background'=>1}, + #'tr' => {'background'=>1}, + 'xmp' => {'href'=>1}, +); + +# find out the URLVIEW command +my $urlviewcommand=""; +my $displaysanitized = 0; # means to display the pre-sanitized URL instead of the pretty one +my $shortcut = 0; # means open it without checking if theres only 1 URL +my $noreview = 0; # means don't display overly-long URLs to be checked before opening +my $persist = 0; # means don't exit after viewing a URL (ignored if $shortcut == 0) +my $ignore_empty = 0; # means to throw out URLs that don't have text in HTML +my $default_view = "url"; # means what shows up in the list by default: urls or contexts +my $alt_select_key = 'k'; +sub getprefs +{ + if (open(PREFFILE,'<',$ENV{'HOME'}."/.extract_urlview")) { + while (<PREFFILE>) { + my $lineread = $_; + if ($lineread =~ /^ALTSELECT [A-Za-fh-z0-9,.<>?;:{}|!@#$%^&*()_=+-`~]$/) { + $lineread =~ /ALTSELECT (.)/; $alt_select_key = $1; + } elsif ($lineread =~ /^SHORTCUT$/) { $shortcut = 1; + } elsif ($lineread =~ /^NOREVIEW$/) { $noreview = 1; + } elsif ($lineread =~ /^PERSISTENT$/) { $persist = 1; + } elsif ($lineread =~ /^DISPLAY_SANITIZED$/) { $displaysanitized = 1; + } elsif ($lineread =~ /^IGNORE_EMPTY_TAGS$/) { $ignore_empty = 1; + } elsif ($lineread =~ /^COMMAND (.*)/) { + $lineread =~ /^COMMAND (.*)/; + $urlviewcommand=$1; + chomp $urlviewcommand; + } elsif ($lineread =~ /^DEFAULT_VIEW (.*)/) { + $lineread =~ /^DEFAULT_VIEW (.*)/; + if ($1 =~ /^context$/) { + $default_view = "context"; + } else { + $default_view = "url"; + } + } elsif ($lineread =~ /^HTML_TAGS (.*)/) { + $lineread =~ /^HTML_TAGS (.*)/; + my @tags = split(',', $1); + my %tags_hash; + foreach my $tag (@tags) { + $tags_hash{lc $tag} = 1; + } + foreach my $tag (keys %link_attr) { + delete $link_attr{$tag} if (! exists($tags_hash{$tag})); + } + } + } + close PREFFILE; + } elsif (open(URLVIEW,'<',$ENV{'HOME'}."/.urlview")) { + while (<URLVIEW>) { + if (/^COMMAND (.*)/) { + $urlviewcommand=$1; + chomp $urlviewcommand; + last; + } + } + close URLVIEW; + } + if ($urlviewcommand eq "") { + if (exists $ENV{BROWSER}) { + $urlviewcommand=$ENV{BROWSER}; + } else { + $urlviewcommand = "open"; + } + } +} + +my %link_hash; +my %orig_text; +my $newlink = 1; +sub foundurl { + my ($uri) = @_; + #$uri =~ s/mailto:(.*)/$1/; + if (! 
$link_hash{$uri}) { + $link_hash{$uri} = $newlink++; + } +} +my $foundurl_text_curindex = 0; +my $foundurl_text_lastindex = 0; +my $foundurl_text_prevurl = ""; +my $foundurl_text_text; + +sub foundurl_text { + my ($uri,$orig) = @_; + $uri = &renderuri($uri); + $foundurl_text_curindex = index($$foundurl_text_text, $orig, $foundurl_text_lastindex); + my $sincelast; + if ($foundurl_text_curindex >= 0) { + # this is the expected behavior + $sincelast = &tidytext(substr($$foundurl_text_text,$foundurl_text_lastindex,($foundurl_text_curindex-$foundurl_text_lastindex))); + } else { + # something odd is going on. What's happened is that our URL finder has + # found a URL that isn't in the text following the last URL it found. + # It *may* be doing things out of order... but that's really strange. + # We rely on it finding URLs in order of appearance in order to get + # context information. I'll try to recover but whatever happens, we + # can't get context information for this URL, and our context info for + # other URLs may be seriously messed up! + $foundurl_text_curindex = index($$foundurl_text_text, $orig); + if ($foundurl_text_curindex >= 0) { + # okay, we can recover... we'll just pretend that *everything* is + # the sincelast text + $sincelast = &tidytext(substr($$foundurl_text_text, 0, $foundurl_text_curindex)); + } else { + # Very strange... I can't even find the URL! The best we can do is + # continue without *any* context... but there's *SERIOUS* weirdness + # going on, and expectations have been *majorly* violated. Let's + # just hope the URL is already closed (and so already has context + # information). I'm setting the curindex so that it'll be zero for + # the next URL (i.e. we can pretend that everything up to the next + # url is "sincelast") + $foundurl_text_curindex = 0 - length($orig); + } + $sincelast = ""; + } + $sincelast =~ s/<$//; + $sincelast =~ s/^>//; + &foundurl($uri); + &process_sincelast($uri, $foundurl_text_prevurl, $sincelast); + $foundurl_text_lastindex = $foundurl_text_curindex + length($orig); + $foundurl_text_prevurl = $uri; +} +sub unfindurl { + my($uri) = @_; + delete($link_hash{$uri}); + delete($orig_text{$uri}); +} +sub renderuri { + my($uri) = @_; + $uri =~ s/&/&/gs; + $uri =~ s/%([0-7][a-fA-F0-9])/chr(hex($1))/egs; + return $uri; +} +sub sanitizeuri { + my($uri) = @_; + $uri =~ s/([^a-zA-Z0-9_.!*()\@:=\?\/%~+-])/sprintf("%%%X",ord($1))/egs; + return $uri; +} + +my $parser = new MIME::Parser; + +my %closedurls; + +sub process_sincelast +{ + my($url,$prev,$sincelast) = @_; + if (length($prev) > 0 && ! exists($closedurls{$prev})) { + $orig_text{$prev} .= " ".substr($sincelast,0,30); + $closedurls{$prev} = 1; + #print "URL(".$link_hash{$prev}.":".$newlink."): $prev ->\n\t".$orig_text{$prev}."\n\n"; + } + if (! exists($closedurls{$url})) { + my $beforetext = substr $sincelast, -30; + if (length($beforetext)) { + $orig_text{$url} = "$beforetext =>URL<="; + } else { + $orig_text{$url} = "=>URL<="; + } + } +} + +sub extract_url_from_text { + ($foundurl_text_text) = @_; + # The idea here is to eliminate duplicate URLs - I want the + # %link_hash to be full of URLs. My regex (in the else statement) + # is decent, but imperfect. URI::Find is better. 
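+	# URI::Find::Schemeless, when it is installed, also catches schemeless
+	# references such as "www.example.com/page"; the fallback regex below
+	# only matches explicit mms/ftp/http/https/news URLs and (optionally
+	# mailto-prefixed) e-mail addresses.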
+ my $fancyfind=1; + eval "use URI::Find::Schemeless"; + $fancyfind=0 if ($@); + if ($fancyfind == 1) { + my $finder = URI::Find::Schemeless->new(\&foundurl_text); + $finder->find($foundurl_text_text); + } else { + $$foundurl_text_text =~ s{(((mms|ftp|http|https)://|news:)[][A-Za-z0-9_.~!*'();:@&=+,/?%#-]+[^](,.'">;[:space:]]|(mailto:)?[-a-zA-Z_0-9.+]+@[-a-zA-Z_0-9.]+)}{ + &foundurl_text($1,$1); + }eg; + } +} + +my $seenstart = 0; +my $seenurl = ""; +my $beforetext = ""; +my $extendedskipped = ""; +my $last10words = ""; +my $words_since_link_end = ""; + +sub tidytext +{ + my ($text) = @_; + my %rendermap = ( + '[\n]' => ' ', + '[\r]' => ' ', + '&#[0-9]+;' => '', + '&#x[0-9a-f]+;' => '', + ' ' => ' ', + '©' => '(c)', + '—' => '---', + '"' => '"', + ''' => "'", + '<' => '<', + '>' => '>', + '&([ACEINOUY])(grave|acute|circ|tilde|uml|ring|cedil);' => '$1', + '&' => '&', + '\s\s+' => ' ', + ); + foreach my $entity (keys %rendermap) { + my $construct = '$text =~ s/$entity/'.$rendermap{$entity}.'/ig'; + eval $construct; + } + $text =~ s/^\s+//; + $text =~ s/\s+$//; + return $text; +} + +sub subwords +{ + my ($string, $minlen) = @_; + my @words = split(/\s+/, $string); + return "" if @words == 0; + my $retstr = $words[0]; + my $wordcount = 1; + while (length($retstr) < $minlen && $wordcount < @words) { + $retstr .= " " . $words[$wordcount]; + $wordcount++; + } + return $retstr; +} + +sub sublastwords +{ + my ($string, $minlen) = @_; + my @words = split(/\s+/, $string); + return "" if @words == 0; + my $retstr = $words[@words-1]; + my $wordcount = 1; + while (length($retstr) < $minlen && $wordcount < @words) { + $wordcount++; + $retstr = $words[@words - $wordcount] . " $retstr"; + } + return $retstr; +} + +sub find_urls_rec +{ + my($ent) = @_; + #print "type: " . $ent->mime_type . " ... parts: ".$ent->parts."\n"; + if ($ent->parts >= 1 or $ent->mime_type eq "multipart/mixed") { + for (my $i=0;$i<$ent->parts;$i++) { + find_urls_rec($ent->parts($i)); + } + } else { + #print "type: " . $ent->mime_type . "\n"; + if ($ent->mime_type eq "message/rfc822") { &find_urls_rec($ent->parts()); } + elsif ($ent->mime_type eq "text/html" ) { + my $parser = HTML::Parser->new(api_version=>3); + my $skipped_text = ""; + #$parser->unbroken_text(1); + $parser->handler(start => sub { + my($tagname,$pos,$text) = @_; + if (my $link_attr = $link_attr{$tagname}) { + while (4 <= @$pos) { + my($k_offset, $k_len, $v_offset, $v_len) = splice(@$pos,-4); + my $attrname = lc(substr($text, $k_offset, $k_len)); + next unless exists($link_attr->{$attrname}); + next unless $v_offset; # 0 v_offset means no value + + # This is REALLY hack-ish and fragile, but can + # sometimes be invaluable + &extract_url_from_text(\$skipped_text) if (length($skipped_text) > 0); + + my $v = substr($text, $v_offset, $v_len); + $v =~ s/^([\'\"])(.*)\1$/$2/; + $v = &renderuri($v); + &foundurl($v); + + $words_since_link_end .= " $skipped_text"; + $last10words = &tidytext("$last10words $skipped_text"); + $last10words = &sublastwords($last10words, 50); + $skipped_text = ""; + + $words_since_link_end = &tidytext($words_since_link_end); + if (length($seenurl) > 0 && ! 
exists($closedurls{$seenurl})) { + my $since_words = &subwords($words_since_link_end, 40); + if (length($since_words) > 0) { + my $space = " "; + $space = "" if ($since_words =~ /^[.,;!?)-]/); + $orig_text{$seenurl} .= "$space$since_words"; + } + $closedurls{$seenurl} = 1; + } + + $beforetext = &sublastwords($last10words, 30); + $seenstart = 1; + $seenurl = $v; + } + } + }, + "tagname, tokenpos, text"); + $parser->handler(end => sub { + my ($text) = @_; + $last10words = &tidytext("$last10words $skipped_text"); + $last10words = &sublastwords($last10words, 50); + if ($seenstart == 1) { + if (! exists($closedurls{$seenurl})) { + my $mtext = "=>$skipped_text<="; + if (length($beforetext)) { + my $space = " "; + $space = "" if ($beforetext =~ /[(-]$/); + $orig_text{$seenurl} = "$beforetext$space$mtext"; + } else { + $orig_text{$seenurl} = "$mtext"; + } + } + if (length($skipped_text) == 0 && $ignore_empty == 1 && ! exists($closedurls{$seenurl})) { + &unfindurl($seenurl); + } + $seenstart = 0; + $extendedskipped .= " $skipped_text"; + $words_since_link_end = ""; + } else { + $words_since_link_end .= " $skipped_text"; + } + $skipped_text = ""; + },"text"); + # the "text" handler is used, rather than skipped_text because + # otherwise blocks of text at the beginning of a "lightly html-ified" + # document can be ignored. + $parser->handler(text => sub { + my ($text) = @_; + $skipped_text = &tidytext("$skipped_text $text"); + }, "text"); + $parser->parse($ent->bodyhandle->as_string); + $parser->eof; + if (length($words_since_link_end) > 0) { + # This is REALLY hack-ish and fragile, but can + # sometimes be invaluable + &extract_url_from_text(\$words_since_link_end); + } + if (length($skipped_text) > 0) { + &extract_url_from_text(\$skipped_text); + } + } elsif ($ent->mime_type =~ /text\/.*/) { + $ent->head->unfold; + my $ctype = $ent->head->get('Content-type'); + if (defined($ctype) and $ctype =~ m/format=flowed/) { + my @lines = $ent->bodyhandle->as_lines; + chomp(@lines); + my $body = ""; + my $delsp; + if ($ctype =~ /delsp=yes/) { + #print "delsp=yes!\n"; + $delsp=1; + } else { + #print "delsp=no!\n"; + $delsp=0; + } + for (my $i=0;$i<@lines;$i++) { + my $col = 0; + my $quotetext = ""; + #print "=> " . $lines[$i] . "\n"; + while (substr($lines[$i],$col,1) eq ">") { + $quotetext .= ">"; + $col++; + } + if ($col > 0) { $body .= "$quotetext "; } + while ($lines[$i] =~ / $/ && $lines[$i] =~ /^$quotetext[^>]/ && $lines[$i+1] =~ /^$quotetext[^>]/) { + my $line; + if ($delsp) { + $line = substr($lines[$i],$col,length($lines[$i])-$col-1); + } else { + $line = substr($lines[$i],$col); + } + $line =~ s/^\s+//; + $body .= $line; + $i++; + } + if ($lines[$i] =~ /^$quotetext[^>]/) { + my $line = substr($lines[$i],$col); + $line =~ s/^\s+//; + $body .= $line."\n"; + } + } + &extract_url_from_text(\$body); + } else { + &extract_url_from_text(\$ent->bodyhandle->as_string); + } + } + } +} + +sub urlwrap { + my($subseq,$text,$linelen,$breaker) = @_; + my $len = length($text); + my $i = 0; + my $output = ""; + while ($len > $linelen) { + if ($i > 0) { $output .= $subseq; } + my $breakpoint = -1; + my $chunk = substr($text,$i,$linelen); + my @chars = ("!","*","'","(",")",";",":","@","&","=","+",",","/","?","%","#","[","]","-","_"); + foreach my $chr ( @chars ) { + my $pt = rindex($chunk,$chr); + if ($breakpoint < $pt) { $breakpoint = $pt; } + } + if ($breakpoint == -1) { $breakpoint = $linelen; } + else { $breakpoint += 1; } + $output .= substr($text,$i,$breakpoint) . 
$breaker; + if ($i == 0) { $linelen -= length($subseq); } + $len -= $breakpoint; + $i += $breakpoint; + } + if ($i > 0) { $output .= $subseq; } + $output .= substr($text,$i); + return $output; +} + +sub isOutputScreen { + use POSIX; + return 0 if POSIX::isatty( \*STDOUT) eq "" ; # pipe + return 1; # screen +} # end of isOutputScreen + +&getprefs(); +$parser->output_to_core(1); +my $filecontents; +if ($#ARGV == 0) { + open(INPUT, "<$ARGV[0]") or die "Couldn't open input file $ARGV[0]: $!"; + $filecontents = join('',<INPUT>); + close(INPUT); +} else { + die "no input provided!\n" if POSIX::isatty( \*STDIN) ne "" ; # pipe + $filecontents = join('',<STDIN>); +} + +if (not $txtonly) { + my $entity = $parser->parse_data($filecontents); + &find_urls_rec($entity); + if (scalar(keys %link_hash) == 0) { + &extract_url_from_text(\$filecontents); + } +} else { + my @lines = <INPUT>; # slurp in the whole file + my $filebody = join("", @lines); # generate a single string from those lines + &extract_url_from_text(\$filebody); +} + +if (&isOutputScreen) { + eval "use Curses::UI"; + $fancymenu = 0 if ($@); +} else { + $fancymenu = 0; +} + +if ($fancymenu == 1) { + #use strict; + + # This is the shortcut... + if ($shortcut == 1 && 1 == scalar keys %link_hash) { + my ($url) = each %link_hash; + $url = &sanitizeuri($url); + if ($urlviewcommand =~ m/%s/) { + $urlviewcommand =~ s/%s/'$url'/g; + } else { + $urlviewcommand .= " $url"; + } + system $urlviewcommand; + exit 0; + } + + # Curses support really REALLY wants to own STDIN + close(STDIN); + open(STDIN,"/dev/tty"); # looks like a hack, smells like a hack... + + my $cui = new Curses::UI( + -color_support => 1, + -clear_on_exit => 1 + ); + my $wrapwidth = $cui->width() - 2; + my %listhash_url; + my %listhash_context; + my @listvals; + # $link_hash{url} = ordering of the urls in the document as first-seen + foreach my $url (sort {$link_hash{$a} <=> $link_hash{$b} } keys(%link_hash)) { + push(@listvals,$link_hash{$url}); + if ($displaysanitized) { + $listhash_url{$link_hash{$url}} = &sanitizeuri($url); + } else { + $listhash_url{$link_hash{$url}} = $url; + } + $listhash_context{$link_hash{$url}} = $orig_text{$url}; + } + + my @menu = ( + { -label => 'Keys: q=quit m=menu s=switch-view c=context g=top G=bottom', + -submenu => [ + { -label => 'About a', -value => \&about }, + { -label => 'Show Command C', -value => \&show_command }, + { -label => 'Switch List View s', -value => \&switch_list }, + { -label => 'Exit ^q', -value => \&exit_dialog } + ], + }, + ); + my $menu = $cui->add( + 'menu','Menubar', + -menu => \@menu, + ); + my $win1 = $cui->add( + 'win1', 'Window', + -border => 1, + -y => 1, + -bfg => 'green', + ); + sub about() + { + $cui->dialog( + -message => "$NAME $version License:$LICENSE" + ); + } + sub show_command() + { + # This extra sprintf work is to ensure that the title + # is fully displayed even if $urlviewcommand is short + my $title = "The configured URL viewing command is:"; + my $len = length($title); + my $cmd = sprintf("%-${len}s",$urlviewcommand); + $cui->dialog( + -title => "The configured URL viewing command is:", + -message => $cmd, + ); + } + sub exit_dialog() + { + my $return = $cui->dialog( + -message => "Do you really want to quit?", + -buttons => ['yes', 'no'], + ); + exit(0) if $return; + } + + my $listbox_labels; + if ($default_view eq "url") { + $listbox_labels = \%listhash_url; + } else { + $listbox_labels = \%listhash_context; + } + my $listbox = $win1->add( + 'mylistbox', 'Listbox', + -values => \@listvals, + -labels => 
$listbox_labels, + ); + $cui->set_binding(sub {$menu->focus()}, "\cX"); + $cui->set_binding(sub {$menu->focus()}, "m"); + $cui->set_binding( sub{exit}, "q" ); + $cui->set_binding( \&exit_dialog , "\cQ"); + $cui->set_binding( sub{exit} , "\cc"); + $cui->set_binding(\&switch_list, "s"); + $cui->set_binding(\&about, "a"); + $cui->set_binding(\&show_command, "C"); + $listbox->set_binding( 'option-last', "G"); + $listbox->set_binding( 'option-first', "g"); + sub switch_list() + { + if ($listbox_labels == \%listhash_url) { + $listbox->labels(\%listhash_context); + $listbox_labels = \%listhash_context; + } elsif ($listbox_labels == \%listhash_context) { + $listbox->labels(\%listhash_url); + $listbox_labels = \%listhash_url; + } + $listbox->focus(); + } + sub madeselection_sub { + my ($stayopen) = @_; + my $rawurl = $listhash_url{$listbox->get_active_value()}; + my $url = &sanitizeuri($rawurl); + my $command = $urlviewcommand; + if ($command =~ m/%s/) { + $command =~ s/%s/'$url'/g; + } else { + $command .= " $url"; + } + my $return = 1; + if ($noreview != 1 && length($rawurl) > ($cui->width()-2)) { + $return = $cui->dialog( + -message => &urlwrap(" ",$rawurl,$cui->width()-8,"\n"), + -title => "Your Choice:", + -buttons => ['ok', 'cancel'], + ); + } + if ($return) { + system $command; + if ($stayopen == 0) { + exit 0 if ($persist == 0); + } else { + exit 0 unless ($persist == 0); + } + } + } + sub madeselection { &madeselection_sub(0); } + sub altexit_madeselection { &madeselection_sub(1); } + $cui->set_binding( \&madeselection, " "); + $listbox->set_routine('option-select',\&madeselection); + $cui->set_binding( \&altexit_madeselection, $alt_select_key); + use Text::Wrap; + sub contextual { + my $rawurl = $listhash_url{$listbox->get_active_value()}; + $Text::Wrap::columns = $cui->width()-8; + if (exists($orig_text{$rawurl}) && length($orig_text{$rawurl}) > 1) { + $cui->dialog( + -message => wrap('','',$orig_text{$rawurl}), + -title => "Context:", + -buttons => ['ok'], + ); + } else { + $cui->error( + -message => "Sorry, I don't have any context for this link", + -buttons => ['ok'], + -bfg => 'red', + -tfg => 'red', + -fg => 'red', + ); + } + } + $cui->set_binding( \&contextual, "c"); + + $listbox->focus(); + $cui->mainloop(); +} else { + # using this as a pass-thru to URLVIEW + foreach my $value (sort {$link_hash{$a} <=> $link_hash{$b} } keys %link_hash) + { + $value = &sanitizeuri($value); + print "$value\n"; + } +} + +__END__ + +=pod + +=head1 NAME + +extract_url -- extract URLs from email messages + +=head1 SYNOPSIS + +extract_url [options] I<file> + +=head1 DESCRIPTION + +This is a Perl script that extracts URLs from correctly-encoded +I<MIME> email messages. This can be used either as a pre-parser for +I<urlview>, or to replace I<urlview> entirely. + +I<Urlview> is a great program, but has some deficiencies. In particular, +it isn't particularly configurable, and cannot handle URLs that have +been broken over several lines in I<format=flowed delsp=yes> email +messages. Nor can it handle I<quoted-printable> email messages. Also, +I<urlview> doesn't eliminate duplicate URLs. This Perl script handles +all of that. It also sanitizes URLs so that they can't break out of the +command shell. + +This is designed primarily for use with the I<mutt> emailer. The idea is +that if you want to access a URL in an email, you pipe the email to a +URL extractor (like this one) which then lets you select a URL to view +in some third program (such as Firefox). 
An alternative design is to +access URLs from within mutt's pager by defining macros and tagging the +URLs in the display to indicate which macro to use. A script you can use +to do that is I<tagurl.pl>. + +=head1 OPTIONS + +=over 4 + +=item B<-h, --help> + +Display this help and exit. + +=item B<-m, --man> + +Display the full man page documentation. + +=item B<-l, --list> + +Prevent use of Ncurses, and simply output a list of extracted URLs. + +=item B<-t, --text> + +Prevent MIME handling; treat the input as plain text. + +=item B<-V, --version> + +Output version information and exit. + +=back + +=head1 DEPENDENCIES + +Mandatory dependencies are B<MIME::Parser> and B<HTML::Parser>. These +usually come with Perl. + +Optional dependencies are B<URI::Find> (recognizes more exotic URL +variations in plain text (without HTML tags)), B<Curses::UI> (allows it +to fully replace I<urlview>), and B<Getopt::Long> (if present, +B<extract_url.pl> recognizes long options --version and --list). + +=head1 EXAMPLES + +This Perl script expects a valid email to be either piped in via STDIN or in a +file listed as the script's only argument. Its STDOUT can be a pipe into +I<urlview> (it will detect this). Here's how you can use it: + + cat message.txt | extract_url.pl + cat message.txt | extract_url.pl | urlview + extract_url.pl message.txt + extract_url.pl message.txt | urlview + +For use with B<mutt 1.4.x>, here's a macro you can use: + + macro index,pager \cb "\ + <enter-command> \ + unset pipe_decode<enter>\ + <pipe-message>extract_url.pl<enter>" \ + "get URLs" + +For use with B<mutt 1.5.x>, here's a more complicated macro you can use: + + macro index,pager \cb "\ + <enter-command> set my_pdsave=\$pipe_decode<enter>\ + <enter-command> unset pipe_decode<enter>\ + <pipe-message>extract_url.pl<enter>\ + <enter-command> set pipe_decode=\$my_pdsave<enter>" \ + "get URLs" + +Here's a suggestion for how to handle I<encrypted email>: + + macro index,pager ,b "\ + <enter-command> set my_pdsave=\$pipe_decode<enter>\ + <enter-command> unset pipe_decode<enter>\ + <pipe-message>extract_url.pl<enter>\ + <enter-command> set pipe_decode=\$my_pdsave<enter>" \ + "get URLs" + + macro index,pager ,B "\ + <enter-command> set my_pdsave=\$pipe_decode<enter>\ + <enter-command> set pipe_decode<enter>\ + <pipe-message>extract_url.pl<enter>\ + <enter-command> set pipe_decode=\$my_pdsave<enter>" \ + "decrypt message, then get URLs" + + message-hook . 'macro index,pager \cb ,b "URL viewer"' + message-hook ~G 'macro index,pager \cb ,B "URL viewer"' + +=head1 CONFIGURATION + +If you're using it with B<Curses::UI> (i.e. as a standalone URL +selector), this Perl script will try and figure out what command to use +based on the contents of your F<~/.urlview> file. However, it also has +its own configuration file (F<~/.extract_urlview>) that will be used +instead, if it exists. So far, there are eight kinds of lines you can +have in this file: + +=over 8 + +=item COMMAND ... + +This line specifies the command that will be used to view URLs. This +command CAN contain a I<%s>, which will be replaced by the URL inside +single-quotes. If it does not contain a I<%s>, the URL will simply be +appended to the command. If this line is not present, the command is +taken from the environment variable $BROWSER. If BROWSER is not set, the +command is assumed to be "open", which is the correct command for MacOS X +systems. + +=item SHORTCUT + +This line specifies that if an email contains only 1 URL, that URL will +be opened without prompting. 
The default (without this line) is to always prompt.
+
+=item NOREVIEW
+
+Normally, if a URL is too long to display on screen in the menu, the
+user will be prompted with the full URL before opening it, just to make
+sure it's correct. This line turns that behavior off.
+
+=item PERSISTENT
+
+By default, when a URL has been selected and viewed from the menu,
+B<extract_url.pl> will exit. If you would like it to be ready to view
+another URL without re-parsing the email (i.e. much like standard
+I<urlview> behavior), add this line to the config file.
+
+=item IGNORE_EMPTY_TAGS
+
+By default, the script collects all the URLs it can find. Sometimes,
+though, HTML messages contain links that don't correspond to any text
+(and aren't normally rendered or accessible). This line tells the
+script to ignore such links.
+
+=item HTML_TAGS ...
+
+This line specifies which HTML tags will be examined for URLs. By
+default, the script is very generous, looking in I<a>, I<applet>,
+I<area>, I<blockquote>, I<embed>, I<form>, I<frame>, I<iframe>,
+I<input>, I<ins>, I<isindex>, I<head>, I<layer>, I<link>, I<object>,
+I<q>, I<script>, and I<xmp> tags for links. If you would like it to
+examine just a subset of these (e.g. you only want I<a> tags to be
+examined), merely list the subset you want, as a comma-separated list.
+If the config file contains more than one such line, the script will
+look for the minimum set of specified tags.
+
+=item ALTSELECT ...
+
+This line specifies a key for an alternate URL-viewing behavior. By
+default, B<extract_url.pl> will quit after the URL viewer has been
+launched for the selected URL; this key launches the URL viewer without
+quitting. However, if I<PERSISTENT> is specified in the config file,
+the opposite is true: normal selection of a URL launches the URL viewer
+without exiting, but this key causes B<extract_url.pl> to exit after
+launching it. This setting defaults to I<k>.
+
+=item DEFAULT_VIEW {url|context}
+
+This line specifies whether to show the list of URLs or the URL
+contexts when the program starts. By default, B<extract_url.pl> shows a
+list of URLs.
+
+=back
+
+Here is an example config file:
+
+  SHORTCUT
+  COMMAND mozilla-firefox -remote "openURL(%s,new-window)"
+  HTML_TAGS a,iframe,link
+  ALTSELECT Q
+  DEFAULT_VIEW context
+
+=head1 STANDARDS
+
+None.
+
+=head1 AVAILABILITY
+
+http://www.memoryhole.net/~kyle/extract_url/
+
+=head1 SEE ALSO
+
+mutt(1)
+urlview(1)
+urlscan(1)
+
+=head1 CAVEATS
+
+All URLs have any potentially dangerous shell characters (namely a
+single quote and a dollar sign) removed (transformed into
+I<percent-encoding>) before they are used in a shell. This should
+eliminate the possibility of a bad URL breaking the shell.
+
+If using B<Curses::UI>, and a URL is too big for your terminal, when
+you select it, B<extract_url.pl> will (by default) ask you to review it
+in a way that lets you see the whole thing.
+
+=head1 AUTHOR
+
+This program was written by Kyle Wheeler <kyle@memoryhole.net>.
+
+It is released under the BSD-2-Clause (simplified) license. For more
+information about the license, visit
+<http://spdx.org/licenses/BSD-2-Clause>.
+
+=cut
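
The shell-safety behavior described under CAVEATS is easy to verify in
isolation. The short, self-contained Perl sketch below (not part of the
patch above; the sample URL is made up for illustration) applies the same
whitelist substitution used by the script's sanitizeuri() routine:

  #!/usr/bin/env perl
  # Minimal sketch of the percent-encoding applied by extract_url.pl's
  # sanitizeuri() before a URL is interpolated into the viewer command.
  use strict;
  use warnings;

  sub sanitizeuri {
      my ($uri) = @_;
      # Anything outside this whitelist (notably ' and $) becomes %XX.
      $uri =~ s/([^a-zA-Z0-9_.!*()\@:=\?\/%~+-])/sprintf("%%%X",ord($1))/egs;
      return $uri;
  }

  # Hypothetical hostile URL: quotes, dollar sign, and spaces are defused.
  print sanitizeuri("http://example.com/?q='\$(rm -rf ~)'"), "\n";
  # prints: http://example.com/?q=%27%24(rm%20-rf%20~)%27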