Cruise Patrol

  Cruise Patrol
Cruise Patrol

 
Site Banner Previous      Next   
Click Below To Visit
Click Here to get in on the Ride
Click Here to get in on the Ride
Details Add a Site      Modify This Listing   
#!/usr/bin/perl5
#
# Page Parser -- CGI script that downloads a web page and prints a
# text-only rendering of it (links, title, scripts and styles removed).
#
# Invocation (both forms are accepted):
#   script.cgi?http://www.example.com/        raw URL as the query string
#   script.cgi?url=http://www.example.com/    named "url" parameter
# With no URL at all, $DEFAULT_URL is analysed.
#
# NOTE(review): this listing was recovered from a copy in which the HTML
# markup inside the output strings had been stripped.  The tags printed
# below (<h2>, <p>, <br>, <hr>) are a reconstruction -- confirm them
# against the deployed page if exact markup matters.

use strict;
use warnings;

use CGI;
use LWP::Simple;
use HTML::TokeParser;

# Page analysed when the request carries no URL.
my $DEFAULT_URL = 'http://www.cnn.com';

# Elements whose whole content is dropped by remove_links().
my %SKIPPED_ELEMENT = map { $_ => 1 } qw(a title script style);

my $cgiobject = CGI->new;

# State shared with the subs below (they take no arguments).
my $webPage;        # raw HTML of the fetched page
my $webText;        # working copy of $webPage used by the text routines
my $bucket = '';    # text collected by remove_links()

print $cgiobject->header;
print $cgiobject->start_html(-title => 'Page Parser', -bgcolor => 'white');

# Optional input form, left disabled as in the original script:
# print $cgiobject->startform
#     (-method => 'get',
#      -action => 'parsepage22w.cgi');
# print "URL to Analyze:" . $cgiobject->textfield
#     (-name => 'url',
#      -size => '40');
# print "<br>" . $cgiobject->submit(-value => 'Analyze');
# print $cgiobject->endform;
# print "<hr>";

my $QS = defined $ENV{QUERY_STRING} ? $ENV{QUERY_STRING} : '';

# Work out which URL to fetch.  A query string that is itself an absolute
# URL wins (the historical calling convention); otherwise use the "url"
# form parameter; otherwise fall back to the default page.
my $fetchURL;
if ($QS =~ m{\Ahttps?://}i) {
    $fetchURL = $QS;
}
else {
    $fetchURL = $cgiobject->param('url');
}
$fetchURL = $DEFAULT_URL
    unless defined $fetchURL && length $fetchURL;

# Only plain web URLs are allowed: LWP would otherwise happily read
# file:// (and other) URLs supplied by the remote user.
unless ($fetchURL =~ m{\Ahttps?://}i) {
    print "<p>Only http:// and https:// URLs can be analysed.</p>\n";
    print $cgiobject->end_html;
    exit;
}

my $shownURL = $cgiobject->escapeHTML($fetchURL);

# retrieve web page
$webPage = get($fetchURL);
unless (defined $webPage) {
    print "<p>Could not retrieve $shownURL</p>\n";
    print $cgiobject->end_html;
    exit;
}

print <<"ENDHTML";
<h2>$shownURL</h2>
<p>has been analysed</p>
<hr>
ENDHTML

remove_links();
# parse_text();
# parse_title();
# parse_meta_description();
# parse_meta_keywords();
# parse_images();
# parse_hyperlinks();

print $cgiobject->end_html;

# Print the page with anchors, title, scripts and styles removed.
#
# Reads $webPage, sets $webText to a copy of it and appends every kept text
# token to $bucket.  Comments, declarations and processing instructions are
# printed as they are met; the collected text is printed at the end.
sub remove_links {
    $webText = $webPage;
    my $stream = HTML::TokeParser->new(\$webText);

    while (my $token = $stream->get_token) {
        my $type = $token->[0];

        if ($type eq 'S') {    # start tag
            # Drop the element's content up to its end tag.  The original
            # discarded a fixed number of following tokens, which leaked or
            # over-deleted text whenever the markup was nested.
            # Other start tags (including <meta>, which has no content)
            # produce no output.
            _skip_element($stream, $token->[1])
                if $SKIPPED_ELEMENT{ $token->[1] };
        }
        elsif ($type eq 'E') {
            # end tags produce no output
        }
        elsif ($type eq 'T') {
            if ($token->[1] eq 'CNN') {
                # NOTE(review): looks like a leftover experiment tied to the
                # default page; kept so existing output does not change.
                $stream->get_token;    # throw away the next token
                print "CZZ";
            }
            else {
                $bucket .= $token->[1];    # keep text
            }
        }
        elsif ($type eq 'C')  { print $token->[1] }
        elsif ($type eq 'D')  { print $token->[1] }
        elsif ($type eq 'PI') { print $token->[2] }
    }

    print "<br>\n";
    print $bucket;
    print "<br>\n<br>\n";
    return;
}

# Consume tokens from $stream up to and including the end tag of $tag.
# If the same element is opened again first (an unclosed tag), that start
# token is pushed back so the caller sees it, and skipping stops there.
sub _skip_element {
    my ($stream, $tag) = @_;

    while (my $token = $stream->get_token) {
        my ($type, $name) = @{$token}[0, 1];
        return if $type eq 'E' && $name eq $tag;
        if ($type eq 'S' && $name eq $tag) {
            $stream->unget_token($token);
            return;
        }
    }
    return;
}

# Print the page text with every tag stripped by regular expressions.
# Works on $webText when remove_links() has filled it, else on $webPage.
sub parse_text {
    my $text = defined $webText ? $webText : $webPage;
    $text = '' unless defined $text;

    $text =~ s/[\r\n]//g;    # remove CRs and LFs
    # Remove each script/style element separately.  The original pattern was
    # greedy and swallowed everything between the first opening tag and the
    # last closing tag on the page.
    $text =~ s{<\s*(script|style)\b.*?<\s*/\s*\1\s*>}{}gis;
    $text =~ s/<[^<>]+>//g;    # remove any remaining tag
    $text =~ s/\t/ /g;         # tabs become spaces
    $text =~ s/ +/ /g;         # collapse runs of spaces
    # NOTE(review): the original pattern here was damaged; removing the
    # &nbsp; entity is the presumed intent -- confirm.
    $text =~ s/&nbsp;//gi;

    print "<h2>Page Text</h2>\n<p>$text</p>\n";
    return;
}

# Print the text of the page's first <title> element (empty if none).
sub parse_title {
    my $parser = HTML::TokeParser->new(\$webPage);
    my $title  = $parser->get_tag("title") ? $parser->get_trimmed_text : '';

    print "<h2>Page title</h2>\n<p>"
        . $cgiobject->escapeHTML($title)
        . "</p>\n";
    return;
}

# Print the content of every <meta> tag whose name matches $pattern, each
# under the given heading.  Meta tags without a name attribute are ignored.
sub _print_meta {
    my ($pattern, $heading) = @_;
    my $parser = HTML::TokeParser->new(\$webPage);

    while (my $token = $parser->get_tag("meta")) {
        my $attr = $token->[1];
        next unless defined $attr->{name} && $attr->{name} =~ $pattern;

        my $content = defined $attr->{content} ? $attr->{content} : '';
        print "<h2>$heading</h2>\n<p>"
            . $cgiobject->escapeHTML($content)
            . "</p>\n";
    }
    return;
}

# Print the page's meta keywords.
sub parse_meta_keywords {
    _print_meta(qr/keywords/i, 'Meta Keywords');
    return;
}

# Print the page's meta description.
sub parse_meta_description {
    _print_meta(qr/description/i, 'Meta Description');
    return;
}

# Count the <img> tags on the page and print the total.
sub parse_images {
    my $parser     = HTML::TokeParser->new(\$webPage);
    my $imageTotal = 0;
    $imageTotal++ while $parser->get_tag("img");

    print "<h2>Image Count</h2>\n<p>Total = $imageTotal</p>\n";
    return;
}

# Print one line per anchor: its text and the URL it links to ("-" when the
# anchor has no href).
sub parse_hyperlinks {
    my $parser = HTML::TokeParser->new(\$webPage);

    print "<h2>Hyperlink Summary</h2>\n";
    while (my $token = $parser->get_tag("a")) {
        my $linkURL  = $token->[1]{href} || "-";
        my $linkText = $parser->get_trimmed_text("/a");

        # NOTE(review): the original tested $linkText against a pattern at
        # this point, but that code was lost from the recovered listing; the
        # text is printed as returned by the parser -- confirm intent.
        print $cgiobject->escapeHTML($linkText)
            . " links to "
            . $cgiobject->escapeHTML($linkURL)
            . "<br>\n";
    }
    return;
}