#!/usr/bin/perl
#
# Harvest documents of one file type (RTF) found through the Google SOAP
# search API.
#
# Usage: script <google-key> [<catalog-file>]
#
#   <google-key>    - passed as the first ("key") parameter of every
#                     doGoogleSearch call.
#   <catalog-file>  - YAML file mapping a truncated MD5 digest to the URL
#                     the document was fetched from; defaults to
#                     "catalog.yaml" in the current directory.
#
# For each search word the script pages through the results ten at a time,
# downloads every hit whose URL ends in ".rtf", stores it in the current
# directory as "<first 16 hex digits of its MD5>.rtf", and records it in the
# catalog.  The catalog is rewritten after every page of results.
# SIGINT/SIGHUP/SIGTERM request a clean stop: the current page is abandoned,
# the catalog is written, and the loop ends.

use strict;
use warnings;

use LWP::Simple;
use SOAP::Lite;
use Digest::MD5 qw( md5_hex );
use YAML qw( LoadFile DumpFile );
use IO::File;

my $key = shift(@ARGV) or die "usage: $0 <google-key> [<catalog-file>]\n";

my $type     = "rtf";
my @words    = qw( the is of and );
my $cat_file = shift(@ARGV) || "catalog.yaml";

# Matches URLs that end in ".$type"; the extension is quoted so it is always
# treated as literal text.
my $type_re = qr/\.\Q$type\E\z/;

# $catalog: digest => URL (what is persisted).
# $seen:    URL => digest (reverse index, used to skip known URLs).
my $catalog;
my $seen = {};

if (-r $cat_file) {
    warn "* Loading catalog from $cat_file ...\n";

    # Fall back to an empty catalog if the file yields nothing usable.
    $catalog = LoadFile($cat_file) || {};
    $seen->{ $catalog->{$_} } = $_ for keys %$catalog;
}
else {
    warn "Can't load catalog from $cat_file. Creating a new one...\n";
    $catalog = {};
}

my $start = 0;    # $catalog->{START} || 0;
my $done  = 0;
my $word;

# Signals only raise a flag; the loops below poll it so that the catalog is
# still written before the script stops.
$SIG{INT} = $SIG{HUP} = $SIG{TERM} = sub { $done++ };

warn "* Initiating Google search service...\n";
my $google = SOAP::Lite->service("http://api.google.com/GoogleSearch.wsdl");

until ($done) {

    # Move on to the next search word at start-up and whenever the current
    # one is exhausted.
    if (not $word or $start >= 1000) {

        # Google doesn't return results > 1000
        $word  = shift @words;
        $start = 0;

        if ($word) {
            warn "* Now using search term '$word'\n";
        }
        else {
            warn "* Run out of search terms! Done.\n";
            exit;
        }
    }

    warn "* Querying Google for results $start + ...\n";

    # key, q, start, maxResults, filter, restrict, safeSearch,
    # lr, ie, oe
    my @params = (
        $key, "filetype:$type +$word", $start, 10, 0, '', 0, '', '', ''
    );
    my $result = $google->doGoogleSearch(@params);
    warn "* No result returned for this query\n" unless $result;

    # Treat a missing result (or missing element list) as an empty page
    # instead of relying on autovivification.
    my $items = ( $result && $result->{resultElements} ) || [];

    for my $item (@$items) {
        last if $done;    # someone hit the stop button

        my $url = $item->{URL};

        # make sure it's RTF
        next unless $url =~ $type_re;

        if ( $seen->{$url} ) {

            # already have it.
            warn "= $url\n";
            next;
        }

        warn "+ $url\n";
        my $data = get($url);

        # get() yields undef on failure; test definedness and length so a
        # body that merely evaluates false (e.g. "0") is not rejected.
        unless ( defined $data && length $data ) {
            warn "Can't load $url?\n";
            next;
        }

        my $md5 = md5_hex($data);
        $md5 = substr( $md5, 0, 16 );    # leave somewhat manageable filenames
        my $file = "$md5.$type";

        if ( -r $file ) {

            # Already have it.
            warn "| $url = $file\n";
            next;
        }

        # Explicit mode argument: the file name is never parsed for a mode.
        my $fh = IO::File->new( $file, "w" );
        unless ($fh) {
            warn "Can't write to $file??\n";
            next;
        }

        warn " -> $file\n";
        $fh->binmode;    # store the downloaded bytes untranslated

        # Buffered write errors may only surface at close, so check both.
        my $printed = $fh->print($data);
        my $closed  = $fh->close;
        unless ( $printed && $closed ) {
            warn "Failed writing $file: $!\n";

            # Remove the partial file so a later run can fetch it again
            # rather than skipping it as "already have it".
            unlink $file;
            next;
        }

        $catalog->{$md5} = $url;
        $seen->{$url}    = $md5;
    }

    warn "* Writing catalog...\n";
    DumpFile( $cat_file, $catalog );

    $start += 10 unless $done;
}