#----------------------------- http://www.perl.com/CPAN/modules/by-category/15_World_Wide_Web_HTML_HTTP_CGI/ #----------------------------- |
#-----------------------------
use LWP::Simple;
$content = get($URL);
#-----------------------------
use LWP::Simple;
unless (defined ($content = get $URL)) {
die "could not get $URL\n";
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -w
# titlebytes - find the title and size of documents
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use URI::Heuristic;
my $raw_url = shift or die "usage: $0 url\n";
my $url = URI::Heuristic::uf_urlstr($raw_url);
$| = 1; # to flush next line
printf "%s =>\n\t", $url;
my $ua = LWP::UserAgent->new();
$ua->agent("Schmozilla/v9.14 Platinum"); # give it time, it'll get there
my $req = HTTP::Request->new(GET => $url);
$req->referer("http://wizard.yellowbrick.oz");
# perplex the log analysers
my $response = $ua->request($req);
if ($response->is_error()) {
printf " %s\n", $response->status_line;
} else {
my $count;
my $bytes;
my $content = $response->content();
$bytes = length $content;
$count = ($content =~ tr/\n/\n/);
printf "%s (%d lines, %d bytes)\n", $response->title(), $count, $bytes; }
#-----------------------------
#% titlebytes http://www.tpj.com/
#http://www.tpj.com/ =>
# The Perl Journal (109 lines, 4530 bytes)
#-----------------------------
|
#-----------------------------
use LWP::Simple;
use URI::URL;
my $url = url('http://www.perl.com/cgi-bin/cpan_mod');
$url->query_form(module => 'DB_File', readme => 1);
$content = get($url);
#-----------------------------
use HTTP::Request::Common qw(POST);
use LWP::UserAgent;
$ua = LWP::UserAgent->new();
my $req = POST 'http://www.perl.com/cgi-bin/cpan_mod',
[ module => 'DB_File', readme => 1 ];
$content = $ua->request($req)->as_string;
#-----------------------------
field1=value1&field2=value2&field3=value3
#-----------------------------
http://www.site.com/path/to/
script.cgi?field1=value1&field2=value2&field3=value3
#-----------------------------
http://www.site.com/path/to/
script.cgi?arg=%22this+isn%27t+%3CEASY%3E+%26+%3CFUN%3E%22
#-----------------------------
$ua->proxy(['http', 'ftp'] => 'http://proxy.myorg.com:8081');
#-----------------------------
|
#-----------------------------
use HTML::LinkExtor;
$parser = HTML::LinkExtor->new(undef, $base_url);
$parser->parse_file($filename);
@links = $parser->links;
foreach $linkarray (@links) {
my @element = @$linkarray;
my $elt_type = shift @element; # element type
# possibly test whether this is an element we're interested in
while (@element) {
# extract the next attribute and its value
my ($attr_name, $attr_value) = splice(@element, 0, 2);
# ... do something with them ...
}
}
#-----------------------------
<A HREF="http://www.perl.com/">Home page</A>
<IMG SRC="images/big.gif" LOWSRC="images/big-lowres.gif">
#-----------------------------
[
[ a, href => "http://www.perl.com/" ],
[ img, src => "images/big.gif",
lowsrc => "images/big-lowres.gif" ]
]
#-----------------------------
if ($elt_type eq 'a' && $attr_name eq 'href') {
print "ANCHOR: $attr_value\n"
if $attr_value->scheme =~ /http|ftp/;
}
if ($elt_type eq 'img' && $attr_name eq 'src') {
print "IMAGE: $attr_value\n";
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -w
# xurl - extract unique, sorted list of links from URL
use HTML::LinkExtor;
use LWP::Simple;
$base_url = shift;
$parser = HTML::LinkExtor->new(undef, $base_url);
$parser->parse(get($base_url))->eof;
@links = $parser->links;
foreach $linkarray (@links) {
my @element = @$linkarray;
my $elt_type = shift @element;
while (@element) {
my ($attr_name , $attr_value) = splice(@element, 0, 2);
$seen{$attr_value}++;
}
}
for (sort keys %seen) { print $_, "\n" }
#-----------------------------
#% xurl http://www.perl.com/CPAN
#ftp://ftp@ftp.perl.com/CPAN/CPAN.html
#
#http://language.perl.com/misc/CPAN.cgi
#
#http://language.perl.com/misc/cpan_module
#
#http://language.perl.com/misc/getcpan
#
#http://www.perl.com/index.html
#
#http://www.perl.com/gifs/lcb.xbm
#-----------------------------
<URL:http://www.perl.com>
#-----------------------------
@URLs = ($message =~ /<URL:(.*?)>/g);
#-----------------------------
|
#----------------------------- # download the following standalone program #!/usr/bin/perl -w -p00 # text2html - trivial html encoding of normal text # -p means apply this script to each record. # -00 mean that a record is now a paragraph use HTML::Entities; $_ = encode_entities($_, "\200-\377"); if (/^\s/) { # Paragraphs beginning with whitespace are wrapped in <PRE> s{(.*)$} {<PRE>\n$1</PRE>\n}s; # indented verbatim } else { s{^(>.*)} {$1<BR>}gm; # quoted text s{<URL:(.*?)>} {<A HREF="$1">$1</A>}gs # embedded URL (good) || s{(http:\S+)} {<A HREF="$1">$1</A>}gs; # guessed URL (bad) s{\*(\S+)\*} {<STRONG>$1</STRONG>}g; # this is *bold* here s{\b_(\S+)\_\b} {<EM>$1</EM>}g; # this is _italics_ here s{^} {<P>\n}; # add paragraph tag } #----------------------------- BEGIN { print "<TABLE>"; $_ = encode_entities(scalar <>); s/\n\s+/ /g; # continuation lines while ( /^(\S+?:)\s*(.*)$/gm ) { # parse heading print "<TR><TH ALIGN='LEFT'>$1</TH><TD>$2</TD></TR>\n"; } print "</TABLE><HR>"; } #----------------------------- |
#----------------------------- $ascii = `lynx -dump $filename`; #----------------------------- use HTML::FormatText; use HTML::Parse; $html = parse_htmlfile($filename); $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50); $ascii = $formatter->format($html); #----------------------------- use HTML::TreeBuilder; use HTML::FormatText; $html = HTML::TreeBuilder->new(); $html->parse($document); $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50); $ascii = $formatter->format($html); #----------------------------- |
#-----------------------------
($plain_text = $html_text) =~ s/<[^>]*>//gs; #WRONG
#-----------------------------
use HTML::Parse;
use HTML::FormatText;
$plain_text = HTML::FormatText->new->format(parse_html($html_text));
#-----------------------------
#% perl -pe 's/<[^>]*>//g' file
#-----------------------------
#<IMG SRC = "foo.gif"
# ALT = "Flurp!">
#-----------------------------
#% perl -0777 -pe 's/<[^>]*>//gs' file
#-----------------------------
{
local $/; # temporary whole-file input mode
$html = <FILE>;
$html =~ s/<[^>]*>//gs;
}
#-----------------------------
#<IMG SRC = "foo.gif" ALT = "A > B">
#
#<!-- <A comment> -->
#
#<script>if (a<b && a>c)</script>
#
#<# Just data #>
#
#<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>
#-----------------------------
#<!-- This section commented out.
# <B>You can't see me!</B>
#-->
#-----------------------------
package MyParser;
use HTML::Parser;
use HTML::Entities qw(decode_entities);
@ISA = qw(HTML::Parser);
sub text {
my($self, $text) = @_;
print decode_entities($text);
}
package main;
MyParser->new->parse_file(*F);
#-----------------------------
($title) = ($html =~ m#<TITLE>\s*(.*?)\s*</TITLE>#is);
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# htitle - get html title from URL
die "usage: $0 url ...\n" unless @ARGV;
require LWP;
foreach $url (@ARGV) {
$ua = LWP::UserAgent->new();
$res = $ua->request(HTTP::Request->new(GET => $url));
print "$url: " if @ARGV > 1;
if ($res->is_success) {
print $res->title, "\n";
} else {
print $res->status_line, "\n";
}
}
#-----------------------------
#% htitle http://www.ora.com
#www.oreilly.com -- Welcome to O'Reilly & Associates!
#
#% htitle http://www.perl.com/ http://www.perl.com/nullvoid
#http://www.perl.com/: The www.perl.com Home Page
#http://www.perl.com/nullvoid: 404 File Not Found
#-----------------------------
|
#----------------------------- # download the following standalone program #!/usr/bin/perl -w # churl - check urls use HTML::LinkExtor; use LWP::Simple qw(get head); $base_url = shift or die "usage: $0 <start_url>\n"; $parser = HTML::LinkExtor->new(undef, $base_url); $parser->parse(get($base_url)); @links = $parser->links; print "$base_url: \n"; foreach $linkarray (@links) { my @element = @$linkarray; my $elt_type = shift @element; while (@element) { my ($attr_name , $attr_value) = splice(@element, 0, 2); if ($attr_value->scheme =~ /\b(ftp|https?|file)\b/) { print " $attr_value: ", head($attr_value) ? "OK" : "BAD", "\n"; } } } #----------------------------- #% churl http://www.wizards.com #http://www.wizards.com: # # FrontPage/FP_Color.gif: OK # # FrontPage/FP_BW.gif: BAD # # #FP_Map: OK # # Games_Library/Welcome.html: OK #----------------------------- |
#----------------------------- # download the following standalone program #!/usr/bin/perl -w # surl - sort URLs by their last modification date use LWP::UserAgent; use HTTP::Request; use URI::URL qw(url); my($url, %Date); my $ua = LWP::UserAgent->new(); while ( $url = url(scalar <>) ) { my($req, $ans); next unless $url->scheme =~ /^(file|https?)$/; $ans = $ua->request(HTTP::Request->new("HEAD", $url)); if ($ans->is_success) { $Date{$url} = $ans->last_modified || 0; # unknown } else { print STDERR "$url: Error [", $ans->code, "] ", $ans->message, "!\n"; } } foreach $url ( sort { $Date{$b} <=> $Date{$a} } keys %Date ) { printf "%-25s %s\n", $Date{$url} ? (scalar localtime $Date{$url}) : "<NONE SPECIFIED>", $url; } #----------------------------- #% xurl http://www.perl.com/ | surl | head #Mon Apr 20 06:16:02 1998 http://electriclichen.com/linux/srom.html # #Fri Apr 17 13:38:51 1998 http://www.oreilly.com/ # #Fri Mar 13 12:16:47 1998 http://www2.binevolve.com/ # #Sun Mar 8 21:01:27 1998 http://www.perl.org/ # #Tue Nov 18 13:41:32 1997 http://www.perl.com/universal/header.map # #Wed Oct 1 12:55:13 1997 http://www.songline.com/ # #Sun Aug 17 21:43:51 1997 http://www.perl.com/graphics/perlhome_header.jpg # #Sun Aug 17 21:43:47 1997 http://www.perl.com/graphics/perl_id_313c.gif # #Sun Aug 17 21:43:46 1997 http://www.perl.com/graphics/ora_logo.gif # #Sun Aug 17 21:43:44 1997 http://www.perl.com/graphics/header-nav.gif #----------------------------- |
#-----------------------------
sub template {
my ($filename, $fillings) = @_;
my $text;
local $/; # slurp mode (undef)
local *F; # create local filehandle
open(F, "< $filename\0") || return;
$text = <F>; # read whole file
close(F); # ignore retval
# replace quoted words with value in %$fillings hash
$text =~ s{ %% ( .*? ) %% }
{ exists( $fillings->{$1} )
? $fillings->{$1}
: ""
}gsex;
return $text;
}
#-----------------------------
#<!-- simple.template for internal template() function -->
#<HTML><HEAD><TITLE>Report for %%username%%</TITLE></HEAD>
#<BODY><H1>Report for %%username%%</H1>
#%%username%% logged in %%count%% times, for a total of %%total%% minutes.
#-----------------------------
#<!-- fancy.template for Text::Template -->
#<HTML><HEAD><TITLE>Report for {$user}</TITLE></HEAD>
#<BODY><H1>Report for {$user}</H1>
#{ lcfirst($user) } logged in {$count} times, for a total of
#{ int($total / 60) } minutes.
#-----------------------------
%fields = (
username => $whats_his_name,
count => $login_count,
total => $minute_used,
);
print template("/home/httpd/templates/simple.template", \%fields);
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -w
# userrep1 - report duration of user logins using SQL database
use DBI;
use CGI qw(:standard);
# template() defined as in the Solution section above
$user = param("username") or die "No username";
$dbh = DBI->connect("dbi:mysql:connections:mysql.domain.com:3306",
"connections", "seekritpassword") or die "Couldn't connect\n";
$sth = $dbh->prepare(<<"END_OF_SELECT") or die "Couldn't prepare SQL";
SELECT COUNT(duration),SUM(duration)
FROM logins WHERE username='$user'
END_OF_SELECT
# this time the duration is assumed to be in seconds
if (@row = $sth->fetchrow()) {
($count, $seconds) = @row;
} else {
($count, $seconds) = (0,0);
}
$sth->finish();
$dbh->disconnect;
print header();
print template("report.tpl", {
'username' => $user,
'count' => $count,
'total' => $total
});
#-----------------------------
You owe: {$total}
#-----------------------------
The average was {$count ? ($total/$count) : 0}.
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -w
# userrep2 - report duration of user logins using SQL database
use Text::Template;
use DBI;
use CGI qw(:standard);
$tmpl = "/home/httpd/templates/fancy.template";
$template = Text::Template->new(-type => "file", -source => $tmpl);
$user = param("username") or die "No username";
$dbh = DBI->connect("dbi:mysql:connections:mysql.domain.com:3306",
"connections", "secret passwd") or die "Couldn't db connect\n";
$sth = $dbh->prepare(<<"END_OF_SELECT") or die "Couldn't prepare SQL";
SELECT COUNT(duration),SUM(duration)
FROM logins WHERE username='$user'
END_OF_SELECT
$sth->execute() or die "Couldn't execute SQL";
if (@row = $sth->fetchrow()) {
($count, $total) = @row;
} else {
$count = $total = 0;
}
$sth->finish();
$dbh->disconnect;
print header();
print $template->fill_in();
#-----------------------------
|
#----------------------------- use LWP::Simple; mirror($URL, $local_filename); #----------------------------- |
#-----------------------------
use LWP::RobotUA;
$ua = LWP::RobotUA->new('websnuffler/0.1', 'me@wherever.com');
#-----------------------------
403 (Forbidden) Forbidden by robots.txt
#-----------------------------
#% GET http://www.webtechniques.com/robots.txt
#User-agent: *
#
# Disallow: /stats
#
# Disallow: /db
#
# Disallow: /logs
#
# Disallow: /store
#
# Disallow: /forms
#
# Disallow: /gifs
#
# Disallow: /wais-src
#
# Disallow: /scripts
#
# Disallow: /config
#-----------------------------
#% GET http://www.cnn.com/robots.txt | head
## robots, scram
#
## $I d : robots.txt,v 1.2 1998/03/10 18:27:01 mreed Exp $
#
#User-agent: *
#
#Disallow: /
#
#User-agent: Mozilla/3.01 (hotwired-test/0.1)
#
#Disallow: /cgi-bin
#
#Disallow: /TRANSCRIPTS
#
#Disallow: /development
#-----------------------------
|
#-----------------------------
while (<LOGFILE>) {
my ($client, $identuser, $authuser, $date, $time, $tz, $method,
$url, $protocol, $status, $bytes) =
/^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+) "(\S+) (.*?) (\S+)" (\S+) (\S+)$/ or next;
# ...
}
#-----------------------------
|
#----------------------------- # download the following standalone program #!/usr/bin/perl -w # sumwww - summarize web server log activity $lastdate = ""; daily_logs(); summary(); exit; # read CLF files and tally hits from the host and to the URL sub daily_logs { while (<>) { ($type, $what) = /"(GET|POST)\s+(\S+?) \S+"/ or next; ($host, undef, undef, $datetime) = split; ($bytes) = /\s(\d+)\s*$/ or next; ($date) = ($datetime =~ /\[([^:]*)/); $posts += ($type eq POST); $home++ if m, / ,; if ($date ne $lastdate) { if ($lastdate) { write_report() } else { $lastdate = $date } } $count++; $hosts{$host}++; $what{$what}++; $bytesum += $bytes; } write_report() if $count; } # use *typeglob aliasing of global variables for cheap copy sub summary { $lastdate = "Grand Total"; *count = *sumcount; *bytesum = *bytesumsum; *hosts = *allhosts; *posts = *allposts; *what = *allwhat; *home = *allhome; write; } # display the tallies of hosts and URLs, using formats sub write_report { write; # add to summary data $lastdate = $date; $sumcount += $count; $bytesumsum += $bytesum; $allposts += $posts; $allhome += $home; # reset daily data $posts = $count = $bytesum = $home = 0; @allwhat{keys %what} = keys %what; @allhosts{keys %hosts} = keys %hosts; %hosts = %what = (); } format STDOUT_TOP = @|||||||||| @|||||| @||||||| @||||||| @|||||| @|||||| @||||||||||||| "Date", "Hosts", "Accesses", "Unidocs", "POST", "Home", "Bytes" ----------- ------- -------- -------- ------- ------- -------------- . format STDOUT = @>>>>>>>>>> @>>>>>> @>>>>>>> @>>>>>>> @>>>>>> @>>>>>> @>>>>>>>>>>>>> $lastdate, scalar(keys %hosts), $count, scalar(keys %what), $posts, $home, $bytesum . #----------------------------- # Date Hosts Accesses Unidocs POST Home Bytes # # ----------- ------- -------- -------- ------- ------- -------------- # # 19/May/1998 353 6447 3074 352 51 16058246 # # 20/May/1998 1938 23868 4288 972 350 61879643 # # 21/May/1998 1775 27872 6596 1064 376 64613798 # # 22/May/1998 1680 21402 4467 735 285 52437374 # # 23/May/1998 1128 21260 4944 592 186 55623059 # # Grand Total 6050 100849 10090 3715 1248 250612120 #----------------------------- # download the following standalone program #!/usr/bin/perl -w # aprept - report on Apache logs use Logfile::Apache; $l = Logfile::Apache->new( File => "-", # STDIN Group => [ Domain, File ]); $l->report(Group => Domain, Sort => Records); $l->report(Group => File, List => [Bytes,Records]); #----------------------------- # Domain Records # # =============================== # # US Commercial 222 38.47% # # US Educational 115 19.93% # # Network 93 16.12% # # Unresolved 54 9.36% # # Australia 48 8.32% # # Canada 20 3.47% # # Mexico 8 1.39% # # United Kingdom 6 1.04% # # # File Bytes Records # # ========================================================= # # / 13008 0.89% 6 1.04% # # /cgi-bin/MxScreen 11870 0.81% 2 0.35% # # /cgi-bin/pickcards 39431 2.70% 48 8.32% # # /deckmaster 143793 9.83% 21 3.64% # # /deckmaster/admin 54447 3.72% 3 0.52% #----------------------------- |
#----------------------------- #<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY> #<H1>Welcome to Scooby World!</H1> #I have <A HREF="pictures.html">pictures</A> of the crazy dog #himself. Here's one!<P> #<IMG SRC="scooby.jpg" ALT="Good doggy!"><P> #<BLINK>He's my hero!</BLINK> I would like to meet him some day, #and get my picture taken with him.<P> #P.S. I am deathly ill. <A HREF="shergold.html">Please send #cards</A>. #</BODY></HTML> #----------------------------- #% htmlsub picture photo scooby.html #<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY> # #<H1>Welcome to Scooby World!</H1> # #I have <A HREF="pictures.html">photos</A> of the crazy dog # #himself. Here's one!<P> # #<IMG SRC="scooby.jpg" ALT="Good doggy!"><P> # #<BLINK>He's my hero!</BLINK> I would like to meet him some day, # #and get my photo taken with him.<P> # #P.S. I am deathly ill. <A HREF="shergold.html">Please send # #cards</A>. # #</BODY></HTML> #----------------------------- # download the following standalone program #!/usr/bin/perl -w # htmlsub - make substitutions in normal text of HTML files # from Gisle Aas <gisle@aas.no> sub usage { die "Usage: $0 <from> <to> <file>...\n" } my $from = shift or usage; my $to = shift or usage; usage unless @ARGV; # Build the HTML::Filter subclass to do the substituting. package MyFilter; require HTML::Filter; @ISA=qw(HTML::Filter); use HTML::Entities qw(decode_entities encode_entities); sub text { my $self = shift; my $text = decode_entities($_[0]); $text =~ s/\Q$from/$to/go; # most important line $self->SUPER::text(encode_entities($text)); } # Now use the class. package main; foreach (@ARGV) { MyFilter->new->parse_file($_); } #----------------------------- |
#----------------------------- #% hrefsub shergold.html cards.html scooby.html #<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY> # #<H1>Welcome to Scooby World!</H1> # #I have <A HREF="pictures.html">pictures</A> of the crazy dog # #himself. Here's one!<P> # #<IMG SRC="scooby.jpg" ALT="Good doggy!"><P> # #<BLINK>He's my hero!</BLINK> I would like to meet him some day, # #and get my picture taken with him.<P> # #P.S. I am deathly ill. <a href="cards.html">Please send # #cards</A>. # #</BODY></HTML> #----------------------------- # download the following standalone program #!/usr/bin/perl -w # hrefsub - make substitutions in <A HREF="..."> fields of HTML files # from Gisle Aas <gisle@aas.no> sub usage { die "Usage: $0 <from> <to> <file>...\n" } my $from = shift or usage; my $to = shift or usage; usage unless @ARGV; # The HTML::Filter subclass to do the substitution. package MyFilter; require HTML::Filter; @ISA=qw(HTML::Filter); use HTML::Entities qw(encode_entities); sub start { my($self, $tag, $attr, $attrseq, $orig) = @_; if ($tag eq 'a' && exists $attr->{href}) { if ($attr->{href} =~ s/\Q$from/$to/g) { # must reconstruct the start tag based on $tag and $attr. # wish we instead were told the extent of the 'href' value # in $orig. my $tmp = "<$tag"; for (@$attrseq) { my $encoded = encode_entities($attr->{$_}); $tmp .= qq( $_="$encoded "); } $tmp .= ">"; $self->output($tmp); return; } } $self->output($orig); } # Now use the class. package main; foreach (@ARGV) { MyFilter->new->parse_file($_); } #----------------------------- |