Craig's List

 
Advertisement      Next   
Click Below To Visit
no banner plugin yet
Details Add a Site      Modify This Listing   
#!/usr/bin/perl5
#
# Page Parser -- a small CGI script that fetches a web page and prints an
# analysis of it.
#
# The page to analyse is taken from the "url" CGI parameter or, failing that,
# from the raw query string (script.cgi?http://example.com/).  When neither is
# given, $DEFAULT_URL is analysed.
#
# Only remove_links() is run by default; the other parse_* analyses are kept
# available and can be enabled in the main section below.

use strict;
use warnings;

use CGI;
use LWP::Simple;
use HTML::TokeParser;

# Page analysed when the request does not name one.
my $DEFAULT_URL = 'http://www.cnn.com';

my $cgiobject = CGI->new;

# State shared by the analysis subs, which are all called without arguments.
my $webPage;        # raw HTML of the fetched page
my $webText;        # working copy of the page used by remove_links/parse_text
my $bucket = '';    # text collected by remove_links

print $cgiobject->header;
print $cgiobject->start_html(
    -title   => 'Page Parser',
    -bgcolor => 'white',
);

# print $cgiobject->startform(
#     -method => 'get',
#     -action => 'parsepage22w.cgi');
# print "URL to Analyze:" . $cgiobject->textfield(
#     -name => 'url',
#     -size => '40');
# print "<br>" . $cgiobject->submit(-value => 'Analyze');
# print $cgiobject->endform;

my $fetchURL = _requested_url();
my $safeURL  = $cgiobject->escapeHTML($fetchURL);

# LWP::Simple::get() also understands file:, ftp:, etc.  The URL comes from
# the request, so anything but plain web addresses is refused.
# NOTE(review): this does not stop requests to hosts on the internal network;
# add a host allow-list if this script is reachable by untrusted users.
if ($fetchURL !~ m{\Ahttps?://}i) {
    print "<p>Only http and https URLs can be analysed.</p>\n";
    print $cgiobject->end_html;
    exit;
}

# retrieve web page
$webPage = get($fetchURL);
if (!defined $webPage) {
    print "<p>Could not retrieve <b>$safeURL</b></p>\n";
    print $cgiobject->end_html;
    exit;
}

print <<"ENDHTML";
<p>
<b>$safeURL</b><br>
has been analysed
</p>
ENDHTML

remove_links();
# parse_text();
# parse_title();
# parse_meta_description();
# parse_meta_keywords();
# parse_images();
# parse_hyperlinks();

print $cgiobject->end_html;

# Return the URL the request asks for: the "url" parameter, else the raw
# query string, else $DEFAULT_URL.
sub _requested_url {
    my $url = $cgiobject->param('url');
    if (!defined $url || $url eq '') {
        $url = $ENV{QUERY_STRING};
    }
    if (!defined $url || $url eq '') {
        $url = $DEFAULT_URL;
    }
    return $url;
}

# Return a fresh HTML::TokeParser over the fetched page, or nothing when no
# page has been fetched.
sub _new_parser {
    return unless defined $webPage;
    return HTML::TokeParser->new(\$webPage);
}

# HTML-escape a value for output; undef is treated as the empty string.
sub _escape {
    my ($value) = @_;
    return '' unless defined $value;
    return $cgiobject->escapeHTML($value);
}

# Print one report section: a heading followed by an already-escaped body.
sub _print_section {
    my ($heading, $body_html) = @_;
    print "<h2>$heading</h2>\n<p>$body_html</p>\n";
    return;
}

# Print the content of every <meta> tag whose name matches $name_re.
sub _print_meta_content {
    my ($name_re, $heading) = @_;
    my $parser = _new_parser() or return;
    while (my $tag = $parser->get_tag('meta')) {
        my $name = $tag->[1]{name};
        next unless defined $name && $name =~ $name_re;
        _print_section($heading, _escape($tag->[1]{content}));
    }
    return;
}

# Collect the page's text with hyperlinks, the title, scripts and style
# sheets removed, and print it.  Comments, declarations and processing
# instructions of the fetched page are echoed as they are encountered.
#
# Reads $webPage; sets $webText (copy of the page) and $bucket (kept text).
sub remove_links {
    return unless defined $webPage;

    $webText = $webPage;
    $bucket  = '';

    # Elements whose whole content is dropped.  <meta> has no content and no
    # end tag, so ignoring its start tag is all that is needed.
    my %drop_content_of = map { $_ => 1 } qw(title script style);

    my $stream = HTML::TokeParser->new(\$webText);
    while (my $token = $stream->get_token) {
        my $type = $token->[0];

        if ($type eq 'S') {
            my $tag = $token->[1];

            # Only <a href=...> is a link; a bare <a name=...> anchor is often
            # left unclosed, and skipping to the next </a> would eat real text.
            my $is_link = $tag eq 'a' && defined $token->[2]{href};

            if ($is_link || $drop_content_of{$tag}) {
                # Consume everything through the matching end tag rather than
                # a fixed number of tokens, so nested markup cannot leak out.
                $stream->get_tag("/$tag");
            }
        }
        elsif ($type eq 'T') {
            if ($token->[1] eq 'CNN') {
                # NOTE(review): looks like a leftover experiment tied to the
                # default URL -- a text token that is exactly "CNN" is replaced
                # by "CZZ" and the token after it is discarded.  Confirm it is
                # still wanted.
                $stream->get_token;
                print "CZZ";
            }
            else {
                $bucket .= $token->[1];    # keep text
            }
        }
        elsif ($type eq 'C' || $type eq 'D') {
            print $token->[1];
        }
        elsif ($type eq 'PI') {
            print $token->[2];
        }
        # End tags ('E') produce no output.
    }

    print "<p>\n", $bucket, "\n</p>\n";
    return;
}

# Reduce the page to plain text with regular expressions and print it.
# Works on $webText, starting from the fetched page if remove_links() has not
# filled it in yet.
sub parse_text {
    $webText = $webPage unless defined $webText;
    return unless defined $webText;

    # Line breaks become spaces so that words on adjacent lines stay apart.
    $webText =~ s/[\r\n]+/ /g;

    # Drop each script/style element separately (non-greedy, matching close).
    $webText =~ s{<\s*(script|style)\b.*?<\s*/\s*\1\s*>}{}gis;

    $webText =~ s/<[^<>]+>//g;     # remove remaining tags
    $webText =~ s/\t/ /g;          # tabs to spaces
    $webText =~ s/&nbsp;/ /gi;     # non-breaking space entities to spaces
    $webText =~ s/ +/ /g;          # collapse runs of spaces

    _print_section('Page Text', $webText);
    return;
}

# Print the text of the page's <title> element (empty if there is none).
sub parse_title {
    my $parser = _new_parser() or return;
    my $title = $parser->get_tag('title') ? $parser->get_trimmed_text : '';
    _print_section('Page title', _escape($title));
    return;
}

# Print the content of the "keywords" meta tag(s).
sub parse_meta_keywords {
    _print_meta_content(qr/keywords/i, 'Meta Keywords');
    return;
}

# Print the content of the "description" meta tag(s).
sub parse_meta_description {
    _print_meta_content(qr/description/i, 'Meta Description');
    return;
}

# Count the <img> tags on the page and print the total.
sub parse_images {
    my $parser = _new_parser() or return;
    my $imageTotal = 0;
    $imageTotal++ while $parser->get_tag('img');
    _print_section('Image Count', "Total = $imageTotal");
    return;
}

# Print every hyperlink on the page as "<text> links to <href>".
sub parse_hyperlinks {
    my $parser = _new_parser() or return;

    print "<h2>Hyperlink Summary</h2>\n";
    while (my $tag = $parser->get_tag('a')) {
        my $linkURL  = $tag->[1]{href} || "-";
        my $linkText = $parser->get_trimmed_text('/a');

        # Links without any text (for example around an empty element) would
        # otherwise print as a blank entry.
        $linkText = '[no text]' if $linkText eq '';

        print _escape($linkText), " links to ", _escape($linkURL), "<br>\n";
    }
    return;
}