|
With a little modification to our script, keyword_url_list,
we can go one step further and retrieve the HTML itself. To perform
this extraction, we can use an av_tool called av_getpage, which
takes the ARC filename and the offset of a given archived document
and returns that document. According to the legend, the offset is
field V and the filename is field g. If we take a space, " ", as the
delimiter, these are fields 10 and 14 respectively. Once we have
filtered out the relevant fields, we can redirect our output to a
file. Type av_getpage --help for more assistance.
#!/usr/bin/perl
#
# Search the CDX index of every drive on every /net/ia001* host for the
# keyword "shark", and fetch each matching document with av_getpage.
#
# Outputs (both in the user's home directory):
#   shark_index - one "offset filename" line per match (CDX fields 10 and 14)
#   shark.arc   - the fetched documents, appended one after another
#
use strict;
use warnings;

my $bin_search = '/local/home/brad/bin_search';
my $av_getpage = '/local/home/brad/av_getpage';
my $index_file = "$ENV{HOME}/shark_index";
my $arc_file   = "$ENV{HOME}/shark.arc";

# Both output files are only ever appended to below, so remove the results
# of any previous run first; otherwise a re-run would duplicate every record.
unlink $index_file, $arc_file;

foreach my $host (glob "/net/ia001*") {    # iterate through each host
    print "searching host $host\n";

    for my $drive (0 .. 3) {               # iterate through each drive
        print " searching drive $drive\n";

        # Keep only fields 10 (offset) and 14 (filename) of each match.
        # tee appends the lines to the cumulative index file while the
        # backticks hand the same lines back for this drive only.
        my @entries = `$bin_search -all "shark" $host/$drive/crawl.cdx.gz | cut -f10,14 -d" " | tee -a "$index_file"`;

        # Fetch only the matches found on THIS drive. Re-reading the whole
        # cumulative index here would request entries from other hosts and
        # drives under the current $host/$drive path.
        foreach my $index (@entries) {
            my ($offset, $filename) = split(" ", $index);

            # Skip blank or truncated index lines.
            next unless defined $offset && defined $filename;

            print "offset = $offset\n";
            print "filename = $filename\n";

            # add .arc.gz to each filename
            $filename = $filename . ".arc.gz";
            print "filename = $filename\n";

            `$av_getpage $host/$drive/$filename $offset >> "$arc_file"`;
        }
    }
}
Note:
shark.arc is in ARC format: by running this script you have
created an ARC file.
|