|
|
Click Below To Visit
AllAmericanCock.com
|
|
|
#!/usr/bin/perl5
use CGI;
use LWP::Simple;
use HTML::TokeParser;
# ---------------------------------------------------------------------------
# Main script: emit the HTTP header and the opening of the HTML page, fetch
# the page named by the query string, report the URL, and print the page text
# with links, title, script and style elements removed.
#
# Package globals set here and read by the subs below:
#   $cgiobject - the CGI object
#   $QS        - the raw query string
#   $fetchURL  - the URL actually fetched
#   $webPage   - the fetched HTML (undef when the fetch failed or was refused)
# ---------------------------------------------------------------------------
$cgiobject = CGI->new;    # direct method call rather than indirect "new CGI"
# NOTE(review): legacy CGI.pm call kept as found -- confirm the installed
# CGI.pm still provides it.
$cgiobject->use_named_parameters;
print $cgiobject->header;
print $cgiobject->start_html(
    -title   => 'Page Parser',
    -bgcolor => 'white',
);
#
#
# print $cgiobject->startform
# (-method=>'get',
# -action=>'parsepage22w.cgi');
# print "URL to Analyze:".$cgiobject->textfield
# (-name=>'url',
# -size=>'40');
# print " ".$cgiobject->submit(-value=>'Analyze');
# print $cgiobject->endform;
# print " ";

# The raw query string is taken as the URL to analyse.
$QS = $ENV{QUERY_STRING};

# Fall back to the default page when no URL was supplied. The fallback used
# to be computed and then ignored, because the fetch always used $QS; the
# lookup of a CGI parameter named after the first command-line argument
# never fed the fetch either, so it has been dropped.
$fetchURL = $QS;
unless ($fetchURL) {
    $fetchURL = "http://www.cnn.com";
}

# retrieve web page
# Only http/https are fetched: the URL comes straight from the request, and
# LWP would otherwise also honour schemes such as file:, exposing local files.
if ($fetchURL =~ m{\Ahttps?://}i) {
    $webPage = get($fetchURL);    # undef when the fetch fails
}
else {
    $webPage = undef;
}

# The URL is request data, so escape it before echoing it into the page.
# NOTE(review): any HTML markup that originally surrounded this message is
# not recoverable from this copy of the script.
my $shown_url = $cgiobject->escapeHTML($fetchURL);
if (defined $webPage) {
    print <<"ENDHTML";
$shown_url
has been analysed
ENDHTML
    remove_links();
}
else {
    print <<"ENDHTML";
$shown_url
could not be retrieved
ENDHTML
}
# &parse_text;
# &parse_title;
# &parse_meta_description;
# &parse_meta_keywords;
# &parse_images;
# &parse_hyperlinks;
# print $cgiobject->end_html;
# Print the page held in $webPage with hyperlinks, title, script and style
# elements removed.
#
# Reads:  $webPage      - fetched HTML; treated as empty when undefined.
# Writes: $webText      - working copy of the page
#         $bucket       - text collected from the page
#         $latest_token - last text fragment that was kept
# Output: comments, declarations and processing instructions are printed as
#         they are met; the collected text is printed last, between "| " and
#         " |".
sub remove_links {
    # Elements whose whole content is dropped. <meta> is deliberately absent:
    # it has no content, and start tags are never printed anyway, so nothing
    # following it needs to be discarded.
    my %skip_content_of = map { $_ => 1 } qw(a title script style);

    $webText = defined $webPage ? $webPage : '';
    $bucket  = '';    # start clean so repeated calls do not accumulate text

    my $stream = HTML::TokeParser->new(\$webText);
    while (my $token = $stream->get_token) {
        my $type = $token->[0];

        if ($type eq 'S') {
            # Start tag: drop the element's content when it is unwanted.
            # Skipping through the matching end tag (instead of throwing away
            # a fixed number of tokens) keeps content that follows an empty
            # element such as <script src="..."></script>.
            my $tag = $token->[1];
            _skip_element($stream, $tag) if $skip_content_of{$tag};
        }
        elsif ($type eq 'T') {
            if ($token->[1] eq 'CNN') {
                # A text fragment that is exactly 'CNN' is replaced by a
                # marker and the token after it is discarded.
                $stream->get_token;
                print "CZZ";
            }
            else {
                # keep text
                $latest_token = $token->[1];
                $bucket .= $latest_token;
            }
        }
        elsif ($type eq 'C')  { print $token->[1] }
        elsif ($type eq 'D')  { print $token->[1] }
        elsif ($type eq 'PI') { print $token->[2] }
        # End tags ('E') are dropped.
    }

    print "| ";
    print $bucket;
    print " |\n";
}

# Consume tokens from $stream up to and including the end tag named $tag.
# Stops early, pushing the token back, when another start tag of the same
# name is met, so an unclosed element cannot swallow the rest of the page.
sub _skip_element {
    my ($stream, $tag) = @_;
    while (my $inner = $stream->get_token) {
        my ($kind, $name) = @{$inner}[0, 1];
        return if $kind eq 'E' && $name eq $tag;
        if ($kind eq 'S' && $name eq $tag) {
            $stream->unget_token($inner);
            return;
        }
    }
    return;
}
# Reduce the HTML held in the global $webText to plain text, in place, and
# print it after the "Page Text" label.
#
# Reads/writes: $webText (treated as empty when undefined).
sub parse_text {
    # $webText = "text here for web page";
    # $webText = $webPage;
    # $webText = $webText2; # output from link remover goes to number 2
    $webText = '' unless defined $webText;

    # Line breaks become blanks so words on adjacent lines stay separated;
    # runs of blanks are collapsed further down.
    $webText =~ s/[\r\n]+/ /g;

    # Remove each script/style element. The match is non-greedy and the
    # closing tag must name the same element, so text lying between two
    # separate script/style blocks is kept.
    $webText =~ s{<[ ]*(script|style)\b.*?<[ ]*/\1[ ]*>}{}gis;

    # Remove every remaining tag.
    $webText =~ s/<[^<>]+>//g;

    # Tabs become blanks.
    $webText =~ s/\t/ /g;

    # NOTE(review): this copy of the script had an escaped blank here, which
    # would delete every space right after the runs were collapsed; it is
    # presumed to be an HTML-decoded "&nbsp;" -- confirm against the original.
    $webText =~ s/&nbsp;/ /gi;

    # Collapse runs of blanks.
    $webText =~ s/ +/ /g;

    print "Page Text $webText ";
}
# Print the text that follows the first <title> tag of the fetched page.
#
# Reads:  $webPage (fetched HTML).
# Writes: $parser  (package global holding the token parser).
sub parse_title {
    $parser = HTML::TokeParser->new(\$webPage);

    # Advance the parser to just after the <title> start tag.
    $parser->get_tag("title");

    my $title_text = $parser->get_trimmed_text;
    print "Page title " . $title_text . "";
}
# Print the content attribute of every <meta> tag whose name attribute
# contains "keywords" (case-insensitive).
#
# Reads:  $webPage (fetched HTML).
# Writes: $parser  (package global holding the token parser).
sub parse_meta_keywords {
    $parser = HTML::TokeParser->new(\$webPage);

    while (my $token = $parser->get_tag("meta")) {
        # Element 1 of the value returned by get_tag is the attribute hash.
        my $attr = $token->[1];
        next unless $attr->{name} =~ /keywords/i;
        print "Meta Keywords " . $attr->{content} . "";
    }
}
# Print the content attribute of every <meta> tag whose name attribute
# contains "description" (case-insensitive).
#
# Reads:  $webPage (fetched HTML).
# Writes: $parser  (package global holding the token parser).
sub parse_meta_description {
    $parser = HTML::TokeParser->new(\$webPage);

    while (my $token = $parser->get_tag("meta")) {
        # Element 1 of the value returned by get_tag is the attribute hash.
        my $attr = $token->[1];
        if ($attr->{name} =~ /description/i) {
            my $description = $attr->{content};
            print "Meta Description " . $description . "";
        }
    }
}
# Count the <img> tags in the fetched page and print the total.
#
# Reads:  $webPage (fetched HTML).
# Writes: $parser  (package global holding the token parser).
sub parse_images {
    $parser = HTML::TokeParser->new(\$webPage);

    # Each successful get_tag("img") call is one more image.
    my $image_count = 0;
    $image_count += 1 while $parser->get_tag("img");

    print "Image Count " . "Total = $image_count";
}
# Print a summary line for every hyperlink in the fetched page: the link text
# followed by the URL it points to ("-" when the tag has no href).
#
# Reads:  $webPage (fetched HTML).
# Writes: $parser  (package global holding the token parser).
sub parse_hyperlinks {
    $parser = HTML::TokeParser->new(\$webPage);
    print "Hyperlink Summary";

    while (my $token = $parser->get_tag("a")) {
        my $linkURL  = $token->[1]{href} || "-";
        my $linkText = $parser->get_trimmed_text("/a");

        # NOTE(review): this copy of the script had a truncated
        # "if ($linkText =~ /..." test fused with the print below, which does
        # not parse. Its pattern and action cannot be recovered from here, so
        # only the print is kept -- restore the test if the original is
        # available.

        # Both values come from the fetched page, so escape them before
        # writing them into our own HTML output.
        print CGI->escapeHTML($linkText) . " "
            . "links to " . CGI->escapeHTML($linkURL) . " ";
    }
}
|
|
|