#----------------------------- http://www.perl.com/CPAN/modules/by-category/15_World_Wide_Web_HTML_HTTP_CGI/ #----------------------------- |
#----------------------------- use LWP::Simple; $content = get($URL); #----------------------------- use LWP::Simple; unless (defined ($content = get $URL)) { die "could not get $URL\n"; } #----------------------------- # download the following standalone program #!/usr/bin/perl -w # titlebytes - find the title and size of documents use LWP::UserAgent; use HTTP::Request; use HTTP::Response; use URI::Heuristic; my $raw_url = shift or die "usage: $0 url\n"; my $url = URI::Heuristic::uf_urlstr($raw_url); $| = 1; # to flush next line printf "%s =>\n\t", $url; my $ua = LWP::UserAgent->new(); $ua->agent("Schmozilla/v9.14 Platinum"); # give it time, it'll get there my $req = HTTP::Request->new(GET => $url); $req->referer("http://wizard.yellowbrick.oz"); # perplex the log analysers my $response = $ua->request($req); if ($response->is_error()) { printf " %s\n", $response->status_line; } else { my $count; my $bytes; my $content = $response->content(); $bytes = length $content; $count = ($content =~ tr/\n/\n/); printf "%s (%d lines, %d bytes)\n", $response->title(), $count, $bytes; } #----------------------------- #% titlebytes http://www.tpj.com/ #http://www.tpj.com/ => # The Perl Journal (109 lines, 4530 bytes) #----------------------------- |
#----------------------------- use LWP::Simple; use URI::URL; my $url = url('http://www.perl.com/cgi-bin/cpan_mod'); $url->query_form(module => 'DB_File', readme => 1); $content = get($url); #----------------------------- use HTTP::Request::Common qw(POST); use LWP::UserAgent; $ua = LWP::UserAgent->new(); my $req = POST 'http://www.perl.com/cgi-bin/cpan_mod', [ module => 'DB_File', readme => 1 ]; $content = $ua->request($req)->as_string; #----------------------------- field1=value1&field2=value2&field3=value3 #----------------------------- http://www.site.com/path/to/ script.cgi?field1=value1&field2=value2&field3=value3 #----------------------------- http://www.site.com/path/to/ script.cgi?arg=%22this+isn%27t+%3CEASY%3E+%26+%3CFUN%3E%22 #----------------------------- $ua->proxy(['http', 'ftp'] => 'http://proxy.myorg.com:8081'); #----------------------------- |
#----------------------------- use HTML::LinkExtor; $parser = HTML::LinkExtor->new(undef, $base_url); $parser->parse_file($filename); @links = $parser->links; foreach $linkarray (@links) { my @element = @$linkarray; my $elt_type = shift @element; # element type # possibly test whether this is an element we're interested in while (@element) { # extract the next attribute and its value my ($attr_name, $attr_value) = splice(@element, 0, 2); # ... do something with them ... } } #----------------------------- <A HREF="http://www.perl.com/">Home page</A> <IMG SRC="images/big.gif" LOWSRC="images/big-lowres.gif"> #----------------------------- [ [ a, href => "http://www.perl.com/" ], [ img, src => "images/big.gif", lowsrc => "images/big-lowres.gif" ] ] #----------------------------- if ($elt_type eq 'a' && $attr_name eq 'href') { print "ANCHOR: $attr_value\n" if $attr_value->scheme =~ /http|ftp/; } if ($elt_type eq 'img' && $attr_name eq 'src') { print "IMAGE: $attr_value\n"; } #----------------------------- # download the following standalone program #!/usr/bin/perl -w # xurl - extract unique, sorted list of links from URL use HTML::LinkExtor; use LWP::Simple; $base_url = shift; $parser = HTML::LinkExtor->new(undef, $base_url); $parser->parse(get($base_url))->eof; @links = $parser->links; foreach $linkarray (@links) { my @element = @$linkarray; my $elt_type = shift @element; while (@element) { my ($attr_name , $attr_value) = splice(@element, 0, 2); $seen{$attr_value}++; } } for (sort keys %seen) { print $_, "\n" } #----------------------------- #% xurl http://www.perl.com/CPAN #ftp://ftp@ftp.perl.com/CPAN/CPAN.html # #http://language.perl.com/misc/CPAN.cgi # #http://language.perl.com/misc/cpan_module # #http://language.perl.com/misc/getcpan # #http://www.perl.com/index.html # #http://www.perl.com/gifs/lcb.xbm #----------------------------- <URL:http://www.perl.com> #----------------------------- @URLs = ($message =~ /<URL:(.*?)>/g); #----------------------------- |
#----------------------------- # download the following standalone program #!/usr/bin/perl -w -p00 # text2html - trivial html encoding of normal text # -p means apply this script to each record. # -00 mean that a record is now a paragraph use HTML::Entities; $_ = encode_entities($_, "\200-\377"); if (/^\s/) { # Paragraphs beginning with whitespace are wrapped in <PRE> s{(.*)$} {<PRE>\n$1</PRE>\n}s; # indented verbatim } else { s{^(>.*)} {$1<BR>}gm; # quoted text s{<URL:(.*?)>} {<A HREF="$1">$1</A>}gs # embedded URL (good) || s{(http:\S+)} {<A HREF="$1">$1</A>}gs; # guessed URL (bad) s{\*(\S+)\*} {<STRONG>$1</STRONG>}g; # this is *bold* here s{\b_(\S+)\_\b} {<EM>$1</EM>}g; # this is _italics_ here s{^} {<P>\n}; # add paragraph tag } #----------------------------- BEGIN { print "<TABLE>"; $_ = encode_entities(scalar <>); s/\n\s+/ /g; # continuation lines while ( /^(\S+?:)\s*(.*)$/gm ) { # parse heading print "<TR><TH ALIGN='LEFT'>$1</TH><TD>$2</TD></TR>\n"; } print "</TABLE><HR>"; } #----------------------------- |
#----------------------------- $ascii = `lynx -dump $filename`; #----------------------------- use HTML::FormatText; use HTML::Parse; $html = parse_htmlfile($filename); $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50); $ascii = $formatter->format($html); #----------------------------- use HTML::TreeBuilder; use HTML::FormatText; $html = HTML::TreeBuilder->new(); $html->parse($document); $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50); $ascii = $formatter->format($html); #----------------------------- |
#----------------------------- ($plain_text = $html_text) =~ s/<[^>]*>//gs; #WRONG #----------------------------- use HTML::Parse; use HTML::FormatText; $plain_text = HTML::FormatText->new->format(parse_html($html_text)); #----------------------------- #% perl -pe 's/<[^>]*>//g' file #----------------------------- #<IMG SRC = "foo.gif" # ALT = "Flurp!"> #----------------------------- #% perl -0777 -pe 's/<[^>]*>//gs' file #----------------------------- { local $/; # temporary whole-file input mode $html = <FILE>; $html =~ s/<[^>]*>//gs; } #----------------------------- #<IMG SRC = "foo.gif" ALT = "A > B"> # #<!-- <A comment> --> # #<script>if (a<b && a>c)</script> # #<# Just data #> # #<![INCLUDE CDATA [ >>>>>>>>>>>> ]]> #----------------------------- #<!-- This section commented out. # <B>You can't see me!</B> #--> #----------------------------- package MyParser; use HTML::Parser; use HTML::Entities qw(decode_entities); @ISA = qw(HTML::Parser); sub text { my($self, $text) = @_; print decode_entities($text); } package main; MyParser->new->parse_file(*F); #----------------------------- ($title) = ($html =~ m#<TITLE>\s*(.*?)\s*</TITLE>#is); #----------------------------- # download the following standalone program #!/usr/bin/perl # htitle - get html title from URL die "usage: $0 url ...\n" unless @ARGV; require LWP; foreach $url (@ARGV) { $ua = LWP::UserAgent->new(); $res = $ua->request(HTTP::Request->new(GET => $url)); print "$url: " if @ARGV > 1; if ($res->is_success) { print $res->title, "\n"; } else { print $res->status_line, "\n"; } } #----------------------------- #% htitle http://www.ora.com #www.oreilly.com -- Welcome to O'Reilly & Associates! # #% htitle http://www.perl.com/ http://www.perl.com/nullvoid #http://www.perl.com/: The www.perl.com Home Page #http://www.perl.com/nullvoid: 404 File Not Found #----------------------------- |
#----------------------------- # download the following standalone program #!/usr/bin/perl -w # churl - check urls use HTML::LinkExtor; use LWP::Simple qw(get head); $base_url = shift or die "usage: $0 <start_url>\n"; $parser = HTML::LinkExtor->new(undef, $base_url); $parser->parse(get($base_url)); @links = $parser->links; print "$base_url: \n"; foreach $linkarray (@links) { my @element = @$linkarray; my $elt_type = shift @element; while (@element) { my ($attr_name , $attr_value) = splice(@element, 0, 2); if ($attr_value->scheme =~ /\b(ftp|https?|file)\b/) { print " $attr_value: ", head($attr_value) ? "OK" : "BAD", "\n"; } } } #----------------------------- #% churl http://www.wizards.com #http://www.wizards.com: # # FrontPage/FP_Color.gif: OK # # FrontPage/FP_BW.gif: BAD # # #FP_Map: OK # # Games_Library/Welcome.html: OK #----------------------------- |
#----------------------------- # download the following standalone program #!/usr/bin/perl -w # surl - sort URLs by their last modification date use LWP::UserAgent; use HTTP::Request; use URI::URL qw(url); my($url, %Date); my $ua = LWP::UserAgent->new(); while ( $url = url(scalar <>) ) { my($req, $ans); next unless $url->scheme =~ /^(file|https?)$/; $ans = $ua->request(HTTP::Request->new("HEAD", $url)); if ($ans->is_success) { $Date{$url} = $ans->last_modified || 0; # unknown } else { print STDERR "$url: Error [", $ans->code, "] ", $ans->message, "!\n"; } } foreach $url ( sort { $Date{$b} <=> $Date{$a} } keys %Date ) { printf "%-25s %s\n", $Date{$url} ? (scalar localtime $Date{$url}) : "<NONE SPECIFIED>", $url; } #----------------------------- #% xurl http://www.perl.com/ | surl | head #Mon Apr 20 06:16:02 1998 http://electriclichen.com/linux/srom.html # #Fri Apr 17 13:38:51 1998 http://www.oreilly.com/ # #Fri Mar 13 12:16:47 1998 http://www2.binevolve.com/ # #Sun Mar 8 21:01:27 1998 http://www.perl.org/ # #Tue Nov 18 13:41:32 1997 http://www.perl.com/universal/header.map # #Wed Oct 1 12:55:13 1997 http://www.songline.com/ # #Sun Aug 17 21:43:51 1997 http://www.perl.com/graphics/perlhome_header.jpg # #Sun Aug 17 21:43:47 1997 http://www.perl.com/graphics/perl_id_313c.gif # #Sun Aug 17 21:43:46 1997 http://www.perl.com/graphics/ora_logo.gif # #Sun Aug 17 21:43:44 1997 http://www.perl.com/graphics/header-nav.gif #----------------------------- |
#----------------------------- sub template { my ($filename, $fillings) = @_; my $text; local $/; # slurp mode (undef) local *F; # create local filehandle open(F, "< $filename\0") || return; $text = <F>; # read whole file close(F); # ignore retval # replace quoted words with value in %$fillings hash $text =~ s{ %% ( .*? ) %% } { exists( $fillings->{$1} ) ? $fillings->{$1} : "" }gsex; return $text; } #----------------------------- #<!-- simple.template for internal template() function --> #<HTML><HEAD><TITLE>Report for %%username%%</TITLE></HEAD> #<BODY><H1>Report for %%username%%</H1> #%%username%% logged in %%count%% times, for a total of %%total%% minutes. #----------------------------- #<!-- fancy.template for Text::Template --> #<HTML><HEAD><TITLE>Report for {$user}</TITLE></HEAD> #<BODY><H1>Report for {$user}</H1> #{ lcfirst($user) } logged in {$count} times, for a total of #{ int($total / 60) } minutes. #----------------------------- %fields = ( username => $whats_his_name, count => $login_count, total => $minute_used, ); print template("/home/httpd/templates/simple.template", \%fields); #----------------------------- # download the following standalone program #!/usr/bin/perl -w # userrep1 - report duration of user logins using SQL database use DBI; use CGI qw(:standard); # template() defined as in the Solution section above $user = param("username") or die "No username"; $dbh = DBI->connect("dbi:mysql:connections:mysql.domain.com:3306", "connections", "seekritpassword") or die "Couldn't connect\n"; $sth = $dbh->prepare(<<"END_OF_SELECT") or die "Couldn't prepare SQL"; SELECT COUNT(duration),SUM(duration) FROM logins WHERE username='$user' END_OF_SELECT # this time the duration is assumed to be in seconds if (@row = $sth->fetchrow()) { ($count, $seconds) = @row; } else { ($count, $seconds) = (0,0); } $sth->finish(); $dbh->disconnect; print header(); print template("report.tpl", { 'username' => $user, 'count' => $count, 'total' => $total }); #----------------------------- You owe: {$total} #----------------------------- The average was {$count ? ($total/$count) : 0}. #----------------------------- # download the following standalone program #!/usr/bin/perl -w # userrep2 - report duration of user logins using SQL database use Text::Template; use DBI; use CGI qw(:standard); $tmpl = "/home/httpd/templates/fancy.template"; $template = Text::Template->new(-type => "file", -source => $tmpl); $user = param("username") or die "No username"; $dbh = DBI->connect("dbi:mysql:connections:mysql.domain.com:3306", "connections", "secret passwd") or die "Couldn't db connect\n"; $sth = $dbh->prepare(<<"END_OF_SELECT") or die "Couldn't prepare SQL"; SELECT COUNT(duration),SUM(duration) FROM logins WHERE username='$user' END_OF_SELECT $sth->execute() or die "Couldn't execute SQL"; if (@row = $sth->fetchrow()) { ($count, $total) = @row; } else { $count = $total = 0; } $sth->finish(); $dbh->disconnect; print header(); print $template->fill_in(); #----------------------------- |
#----------------------------- use LWP::Simple; mirror($URL, $local_filename); #----------------------------- |
#----------------------------- use LWP::RobotUA; $ua = LWP::RobotUA->new('websnuffler/0.1', 'me@wherever.com'); #----------------------------- 403 (Forbidden) Forbidden by robots.txt #----------------------------- #% GET http://www.webtechniques.com/robots.txt #User-agent: * # # Disallow: /stats # # Disallow: /db # # Disallow: /logs # # Disallow: /store # # Disallow: /forms # # Disallow: /gifs # # Disallow: /wais-src # # Disallow: /scripts # # Disallow: /config #----------------------------- #% GET http://www.cnn.com/robots.txt | head ## robots, scram # ## $I d : robots.txt,v 1.2 1998/03/10 18:27:01 mreed Exp $ # #User-agent: * # #Disallow: / # #User-agent: Mozilla/3.01 (hotwired-test/0.1) # #Disallow: /cgi-bin # #Disallow: /TRANSCRIPTS # #Disallow: /development #----------------------------- |
#----------------------------- while (<LOGFILE>) { my ($client, $identuser, $authuser, $date, $time, $tz, $method, $url, $protocol, $status, $bytes) = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+) "(\S+) (.*?) (\S+)" (\S+) (\S+)$/ or next; # ... } #----------------------------- |
#----------------------------- # download the following standalone program #!/usr/bin/perl -w # sumwww - summarize web server log activity $lastdate = ""; daily_logs(); summary(); exit; # read CLF files and tally hits from the host and to the URL sub daily_logs { while (<>) { ($type, $what) = /"(GET|POST)\s+(\S+?) \S+"/ or next; ($host, undef, undef, $datetime) = split; ($bytes) = /\s(\d+)\s*$/ or next; ($date) = ($datetime =~ /\[([^:]*)/); $posts += ($type eq POST); $home++ if m, / ,; if ($date ne $lastdate) { if ($lastdate) { write_report() } else { $lastdate = $date } } $count++; $hosts{$host}++; $what{$what}++; $bytesum += $bytes; } write_report() if $count; } # use *typeglob aliasing of global variables for cheap copy sub summary { $lastdate = "Grand Total"; *count = *sumcount; *bytesum = *bytesumsum; *hosts = *allhosts; *posts = *allposts; *what = *allwhat; *home = *allhome; write; } # display the tallies of hosts and URLs, using formats sub write_report { write; # add to summary data $lastdate = $date; $sumcount += $count; $bytesumsum += $bytesum; $allposts += $posts; $allhome += $home; # reset daily data $posts = $count = $bytesum = $home = 0; @allwhat{keys %what} = keys %what; @allhosts{keys %hosts} = keys %hosts; %hosts = %what = (); } format STDOUT_TOP = @|||||||||| @|||||| @||||||| @||||||| @|||||| @|||||| @||||||||||||| "Date", "Hosts", "Accesses", "Unidocs", "POST", "Home", "Bytes" ----------- ------- -------- -------- ------- ------- -------------- . format STDOUT = @>>>>>>>>>> @>>>>>> @>>>>>>> @>>>>>>> @>>>>>> @>>>>>> @>>>>>>>>>>>>> $lastdate, scalar(keys %hosts), $count, scalar(keys %what), $posts, $home, $bytesum . #----------------------------- # Date Hosts Accesses Unidocs POST Home Bytes # # ----------- ------- -------- -------- ------- ------- -------------- # # 19/May/1998 353 6447 3074 352 51 16058246 # # 20/May/1998 1938 23868 4288 972 350 61879643 # # 21/May/1998 1775 27872 6596 1064 376 64613798 # # 22/May/1998 1680 21402 4467 735 285 52437374 # # 23/May/1998 1128 21260 4944 592 186 55623059 # # Grand Total 6050 100849 10090 3715 1248 250612120 #----------------------------- # download the following standalone program #!/usr/bin/perl -w # aprept - report on Apache logs use Logfile::Apache; $l = Logfile::Apache->new( File => "-", # STDIN Group => [ Domain, File ]); $l->report(Group => Domain, Sort => Records); $l->report(Group => File, List => [Bytes,Records]); #----------------------------- # Domain Records # # =============================== # # US Commercial 222 38.47% # # US Educational 115 19.93% # # Network 93 16.12% # # Unresolved 54 9.36% # # Australia 48 8.32% # # Canada 20 3.47% # # Mexico 8 1.39% # # United Kingdom 6 1.04% # # # File Bytes Records # # ========================================================= # # / 13008 0.89% 6 1.04% # # /cgi-bin/MxScreen 11870 0.81% 2 0.35% # # /cgi-bin/pickcards 39431 2.70% 48 8.32% # # /deckmaster 143793 9.83% 21 3.64% # # /deckmaster/admin 54447 3.72% 3 0.52% #----------------------------- |
#----------------------------- #<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY> #<H1>Welcome to Scooby World!</H1> #I have <A HREF="pictures.html">pictures</A> of the crazy dog #himself. Here's one!<P> #<IMG SRC="scooby.jpg" ALT="Good doggy!"><P> #<BLINK>He's my hero!</BLINK> I would like to meet him some day, #and get my picture taken with him.<P> #P.S. I am deathly ill. <A HREF="shergold.html">Please send #cards</A>. #</BODY></HTML> #----------------------------- #% htmlsub picture photo scooby.html #<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY> # #<H1>Welcome to Scooby World!</H1> # #I have <A HREF="pictures.html">photos</A> of the crazy dog # #himself. Here's one!<P> # #<IMG SRC="scooby.jpg" ALT="Good doggy!"><P> # #<BLINK>He's my hero!</BLINK> I would like to meet him some day, # #and get my photo taken with him.<P> # #P.S. I am deathly ill. <A HREF="shergold.html">Please send # #cards</A>. # #</BODY></HTML> #----------------------------- # download the following standalone program #!/usr/bin/perl -w # htmlsub - make substitutions in normal text of HTML files # from Gisle Aas <gisle@aas.no> sub usage { die "Usage: $0 <from> <to> <file>...\n" } my $from = shift or usage; my $to = shift or usage; usage unless @ARGV; # Build the HTML::Filter subclass to do the substituting. package MyFilter; require HTML::Filter; @ISA=qw(HTML::Filter); use HTML::Entities qw(decode_entities encode_entities); sub text { my $self = shift; my $text = decode_entities($_[0]); $text =~ s/\Q$from/$to/go; # most important line $self->SUPER::text(encode_entities($text)); } # Now use the class. package main; foreach (@ARGV) { MyFilter->new->parse_file($_); } #----------------------------- |
#----------------------------- #% hrefsub shergold.html cards.html scooby.html #<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY> # #<H1>Welcome to Scooby World!</H1> # #I have <A HREF="pictures.html">pictures</A> of the crazy dog # #himself. Here's one!<P> # #<IMG SRC="scooby.jpg" ALT="Good doggy!"><P> # #<BLINK>He's my hero!</BLINK> I would like to meet him some day, # #and get my picture taken with him.<P> # #P.S. I am deathly ill. <a href="cards.html">Please send # #cards</A>. # #</BODY></HTML> #----------------------------- # download the following standalone program #!/usr/bin/perl -w # hrefsub - make substitutions in <A HREF="..."> fields of HTML files # from Gisle Aas <gisle@aas.no> sub usage { die "Usage: $0 <from> <to> <file>...\n" } my $from = shift or usage; my $to = shift or usage; usage unless @ARGV; # The HTML::Filter subclass to do the substitution. package MyFilter; require HTML::Filter; @ISA=qw(HTML::Filter); use HTML::Entities qw(encode_entities); sub start { my($self, $tag, $attr, $attrseq, $orig) = @_; if ($tag eq 'a' && exists $attr->{href}) { if ($attr->{href} =~ s/\Q$from/$to/g) { # must reconstruct the start tag based on $tag and $attr. # wish we instead were told the extent of the 'href' value # in $orig. my $tmp = "<$tag"; for (@$attrseq) { my $encoded = encode_entities($attr->{$_}); $tmp .= qq( $_="$encoded "); } $tmp .= ">"; $self->output($tmp); return; } } $self->output($orig); } # Now use the class. package main; foreach (@ARGV) { MyFilter->new->parse_file($_); } #----------------------------- |