source: trunk/third/xscreensaver/hacks/webcollage @ 20148

Revision 20148, 73.4 KB checked in by ghudson, 21 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20147, which included commits to RCS files with non-trunk default branches.
  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2#
3# webcollage, Copyright (c) 1999-2003 by Jamie Zawinski <jwz@jwz.org>
4# This program decorates the screen with random images from the web.
5# One satisfied customer described it as "a nonstop pop culture brainbath."
6#
7# Permission to use, copy, modify, distribute, and sell this software and its
8# documentation for any purpose is hereby granted without fee, provided that
9# the above copyright notice appear in all copies and that both that
10# copyright notice and this permission notice appear in supporting
11# documentation.  No representations are made about the suitability of this
12# software for any purpose.  It is provided "as is" without express or
13# implied warranty.
14
15
16# To run this as a display mode with xscreensaver, add this to `programs':
17#
18#     webcollage -root
19#     webcollage -root -filter 'vidwhacker -stdin -stdout'
20
21
22# If you have the "driftnet" program installed, webcollage can display a
23# collage of images sniffed off your local ethernet, instead of pulled out
24# of search engines: in that way, your screensaver can display the images
25# that your co-workers are downloading!
26#
27# Driftnet is available here: http://www.ex-parrot.com/~chris/driftnet/
28# Use it like so:
29#
30#     webcollage -root -driftnet
31#
32# Driftnet is the Unix implementation of the MacOS "EtherPEG" program.
33
34
35require 5;
36use strict;
37
38# We can't "use diagnostics" here, because that library malfunctions if
39# you signal and catch alarms: it says "Uncaught exception from user code"
40# and exits, even though I damned well AM catching it!
41#use diagnostics;
42
43
44use Socket;
45require Time::Local;
46require POSIX;
47use Fcntl ':flock'; # import LOCK_* constants
48use POSIX qw(strftime);
49
50use bytes;  # Larry can take Unicode and shove it up his ass sideways.
51            # Perl 5.8.0 causes us to start getting incomprehensible
52            # errors about UTF-8 all over the place without this.
53
54
55my $progname = $0; $progname =~ s@.*/@@g;
56my $version = q{ $Revision: 1.1.1.2 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
57my $copyright = "WebCollage $version, Copyright (c) 1999-2002" .
58    " Jamie Zawinski <jwz\@jwz.org>\n" .
59    "            http://www.jwz.org/xscreensaver/\n";
60
61
62
63my @search_methods = (  77, "altavista",  \&pick_from_alta_vista_random_link,
64                        14, "yahoorand",  \&pick_from_yahoo_random_link,
65                         9, "yahoonews",  \&pick_from_yahoo_news_text,
66
67                     # Alta Vista has a new "random link" URL now.
68                     # They added it specifically to better support webcollage!
69                     # That was super cool of them.  This is how we used to do
70                     # it, before:
71                     #
72                     #  0, "avimages", \&pick_from_alta_vista_images,
73                     #  0, "avtext",   \&pick_from_alta_vista_text,
74
75                     # Google asked (nicely) for me to stop searching them.
76                     # I asked them to add a "random link" url.  They said
77                     # "that would be easy, we'll think about it" and then
78                     # never wrote back.  Booo Google!  Booooo!
79                     #
80                     #   0, "googlenums", \&pick_from_google_image_numbers,
81                     #   0, "googleimgs", \&pick_from_google_images,
82
83                     # I suspect Hotbot is actually the same search engine
84                     # data as Lycos.
85                     #
86                     #  0, "hotbot",     \&pick_from_hotbot_text,
87
88                     # Eh, Lycos sucks anyway.
89                     #   0, "lycos",      \&pick_from_lycos_text,
90                      );
91
92# programs we can use to write to the root window (tried in ascending order.)
93#
94my @root_displayers = (
95  "xscreensaver-getimage -root -file",
96  "chbg       -once -xscreensaver -max_size 100",
97  "xv         -root -quit -viewonly +noresetroot -quick24 -rmode 5" .
98  "           -rfg black -rbg black",
99  "xli        -quiet -onroot -center -border black",
100  "xloadimage -quiet -onroot -center -border black",
101
102# this lame program wasn't built with vroot.h:
103# "xsri       -scale -keep-aspect -center-horizontal -center-vertical",
104);
105
106
107# Some sites need cookies to work properly.   These are they.
108#
109my %cookies = (
110  "www.altavista.com"  =>  "AV_ALL=1",   # request uncensored searches
111  "web.altavista.com"  =>  "AV_ALL=1",
112
113                                         # log in as "cipherpunk"
114  "www.nytimes.com"    =>  'NYT-S=18cHMIlJOn2Y1bu5xvEG3Ufuk6E1oJ.' .
115                           'FMxWaQV0igaB5Yi/Q/guDnLeoL.pe7i1oakSb' .
116                           '/VqfdUdb2Uo27Vzt1jmPn3cpYRlTw9',
117);
118
119
120# If this is set, it's a helper program to use for pasting images together:
121# this is a lot faster and more efficient than using PPM pipelines, which is
122# what we do if this program doesn't exist.  (We check for "webcollage-helper"
123# on $PATH at startup, and set this variable appropriately.)
124#
125my $webcollage_helper = undef;
126
127
128# If we have the webcollage-helper program, then it will paste the images
129# together with transparency!  0.0 is invisible, 1.0 is totally opaque.
130#
131my $opacity = 0.85;
132
133
134# Some sites have  managed to poison the search engines.  These are they.
135# (We auto-detect sites that have poisoned the search engines via excessive
136# keywords or dictionary words,  but these are ones that slip through
137# anyway.)
138#
139# This can contain full host names, or 2 or 3 component domains.
140#
141my %poisoners = (
142  "die.net"                 => 1,  # 'l33t h4ck3r d00dz.
143  "genforum.genealogy.com"  => 1,  # Cluttering avtext with human names.
144  "rootsweb.com"            => 1,  # Cluttering avtext with human names.
145  "akamai.net"              => 1,  # Lots of sites have their images on Akamai.
146  "akamaitech.net"          => 1,  # But those are pretty much all banners.
147                                   # Since Akamai is super-expensive, let's
148                                   # go out on a limb and assume that all of
149                                   # their customers are rich-and-boring.
150  "bartleby.com"            => 1,  # Dictionary, cluttering avtext.
151  "encyclopedia.com"        => 1,  # Dictionary, cluttering avtext.
152  "onlinedictionary.datasegment.com" => 1,  # Dictionary, cluttering avtext.
153  "hotlinkpics.com"         => 1,  # Porn site that has poisoned avimages
154                                   # (I don't see how they did it, though!)
155  "alwayshotels.com"        => 1,  # Poisoned Lycos pretty heavily.
156  "nextag.com"              => 1,  # Poisoned Alta Vista real good.
157);
158
159
160# When verbosity is turned on, we warn about sites that we seem to be hitting
161# a lot: usually this means some new poisoner has made it into the search
162# engines.  But sometimes, the warning is just because that site has a lot
163# of stuff on it.  So these are the sites that are immune to the "frequent
164# site" diagnostic message.
165#
166my %warningless_sites = (
167  "home.earthlink.net"      => 1,  # Lots of home pages here.
168  "www.geocities.com"       => 1,
169  "www.angelfire.com"       => 1,
170  "members.aol.com"         => 1,
171
172  "yimg.com"                => 1,  # This is where dailynews.yahoo.com stores
173  "eimg.com"                => 1,  # its images, so pick_from_yahoo_news_text()
174                                   # hits this every time.
175
176  "driftnet"                => 1,  # builtin...
177);
178
179
180##############################################################################
181#
182# Various global flags set by command line parameters, or computed
183#
184##############################################################################
185
186
187my $current_state = "???";      # for diagnostics
188my $load_method;
189my $last_search;
190my $image_succeeded = -1;
191my $suppress_audit = 0;
192
193my $verbose_imgmap = 0;         # print out rectangles and URLs only (stdout)
194my $verbose_warnings = 0;       # print out warnings when things go wrong
195my $verbose_load = 0;           # diagnostics about loading of URLs
196my $verbose_filter = 0;         # diagnostics about page selection/rejection
197my $verbose_net = 0;            # diagnostics about network I/O
198my $verbose_pbm = 0;            # diagnostics about PBM pipelines
199my $verbose_http = 0;           # diagnostics about all HTTP activity
200my $verbose_exec = 0;           # diagnostics about executing programs
201
202my $report_performance_interval = 60 * 15;  # print some stats every 15 minutes
203
204my $http_proxy = undef;
205my $http_timeout = 30;
206my $cvt_timeout = 10;
207
208my $min_width = 50;
209my $min_height = 50;
210my $min_ratio = 1/5;
211
212my $min_gif_area = (120 * 120);
213
214
215my $no_output_p = 0;
216my $urls_only_p = 0;
217
218my @pids_to_kill = ();  # forked pids we should kill when we exit, if any.
219
220my $driftnet_magic = 'driftnet';
221my $driftnet_dir = undef;
222my $default_driftnet_cmd = "driftnet -a -m 100";
223
224my $wordlist;
225
226my %rejected_urls;
227my @tripwire_words = ("aberrate", "abode", "amorphous", "antioch",
228                      "arrhenius", "arteriole", "blanket", "brainchild",
229                      "burdensome", "carnival", "cherub", "chord", "clever",
230                      "dedicate", "dilogarithm", "dolan", "dryden",
231                      "eggplant");
232
233
234##############################################################################
235#
236# Retrieving URLs
237#
238##############################################################################
239
240# returns three values: the HTTP response line; the document headers;
241# and the document body.
242#
243sub get_document_1 {
244  my ( $url, $referer, $timeout ) = @_;
245
246  if (!defined($timeout)) { $timeout = $http_timeout; }
247  if ($timeout > $http_timeout) { $timeout = $http_timeout; }
248
249  if ($timeout <= 0) {
250    LOG (($verbose_net || $verbose_load), "timed out for $url");
251    return ();
252  }
253
254  LOG ($verbose_net, "get_document_1 $url " . ($referer ? $referer : ""));
255
256  if (! ($url =~ m@^http://@i)) {
257    LOG ($verbose_net, "not an HTTP URL: $url");
258    return ();
259  }
260
261  my ($url_proto, $dummy, $serverstring, $path) = split(/\//, $url, 4);
262  $path = "" unless $path;
263
264  my ($them,$port) = split(/:/, $serverstring);
265  $port = 80 unless $port;
266
267  my $them2 = $them;
268  my $port2 = $port;
269  if ($http_proxy) {
270    $serverstring = $http_proxy if $http_proxy;
271    $serverstring =~ s@^[a-z]+://@@;
272    ($them2,$port2) = split(/:/, $serverstring);
273    $port2 = 80 unless $port2;
274  }
275
276  my ($remote, $iaddr, $paddr, $proto, $line);
277  $remote = $them2;
278  if ($port2 =~ /\D/) { $port2 = getservbyname($port2, 'tcp') }
279  if (!$port2) {
280    LOG (($verbose_net || $verbose_load), "unrecognised port in $url");
281    return ();
282  }
283  $iaddr   = inet_aton($remote);
284  if (!$iaddr) {
285    LOG (($verbose_net || $verbose_load), "host not found: $remote");
286    return ();
287  }
288  $paddr   = sockaddr_in($port2, $iaddr);
289
290
291  my $head = "";
292  my $body = "";
293
294  @_ =
295    eval {
296      local $SIG{ALRM} = sub {
297        LOG (($verbose_net || $verbose_load), "timed out ($timeout) for $url");
298        die "alarm\n";
299      };
300      alarm $timeout;
301
302      $proto   = getprotobyname('tcp');
303      if (!socket(S, PF_INET, SOCK_STREAM, $proto)) {
304        LOG (($verbose_net || $verbose_load), "socket: $!");
305        return ();
306      }
307      if (!connect(S, $paddr)) {
308        LOG (($verbose_net || $verbose_load), "connect($serverstring): $!");
309        return ();
310      }
311
312      select(S); $| = 1; select(STDOUT);
313
314      my $cookie = $cookies{$them};
315
316      my $user_agent = "$progname/$version";
317
318      if ($url =~ m@^http://www\.altavista\.com/@ ||
319          $url =~ m@^http://random\.yahoo\.com/@) {
320        # block this, you turkeys.
321        $user_agent = "Mozilla/4.76 [en] (X11; U; Linux 2.2.16-22 i686; Nav)";
322      }
323
324      my $hdrs = "GET " . ($http_proxy ? $url : "/$path") . " HTTP/1.0\r\n" .
325                 "Host: $them\r\n" .
326                 "User-Agent: $user_agent\r\n";
327      if ($referer) {
328        $hdrs .= "Referer: $referer\r\n";
329      }
330      if ($cookie) {
331        my @cc = split(/\r?\n/, $cookie);
332        $hdrs .= "Cookie: " . join('; ', @cc) . "\r\n";
333      }
334      $hdrs .= "\r\n";
335
336      foreach (split('\r?\n', $hdrs)) {
337        LOG ($verbose_http, "  ==> $_");
338      }
339      print S $hdrs;
340      my $http = <S> || "";
341
342      # Kludge: the Yahoo Random Link is now returning as its first
343      # line "Status: 301" instead of "HTTP/1.0 301 Found".  Fix it...
344      #
345      $http =~ s@^Status:\s+(\d+)\b@HTTP/1.0 $1@i;
346
347      $_  = $http;
348      s/[\r\n]+$//s;
349      LOG ($verbose_http, "  <== $_");
350
351      while (<S>) {
352        $head .= $_;
353        s/[\r\n]+$//s;
354        last if m@^$@;
355        LOG ($verbose_http, "  <== $_");
356
357        if (m@^Set-cookie:\s*([^;\r\n]+)@i) {
358          set_cookie($them, $1)
359        }
360      }
361
362      my $lines = 0;
363      while (<S>) {
364        $body .= $_;
365        $lines++;
366      }
367
368      LOG ($verbose_http,
369           "  <== [ body ]: $lines lines, " . length($body) . " bytes");
370
371      close S;
372
373      if (!$http) {
374        LOG (($verbose_net || $verbose_load), "null response: $url");
375        return ();
376      }
377
378      return ( $http, $head, $body );
379    };
380  die if ($@ && $@ ne "alarm\n");       # propagate errors
381  if ($@) {
382    # timed out
383    $head = undef;
384    $body = undef;
385    $suppress_audit = 1;
386    return ();
387  } else {
388    # didn't
389    alarm 0;
390    return @_;
391  }
392}
393
394
395# returns two values: the document headers; and the document body.
396# if the given URL did a redirect, returns the redirected-to document.
397#
398sub get_document {
399  my ( $url, $referer, $timeout ) = @_;
400  my $start = time;
401
402  if (defined($referer) && $referer eq $driftnet_magic) {
403    return get_driftnet_file ($url);
404  }
405
406  my $orig_url = $url;
407  my $loop_count = 0;
408  my $max_loop_count = 4;
409
410  do {
411    if (defined($timeout) && $timeout <= 0) {
412      LOG (($verbose_net || $verbose_load), "timed out for $url");
413      $suppress_audit = 1;
414      return ();
415    }
416
417    my ( $http, $head, $body ) = get_document_1 ($url, $referer, $timeout);
418
419    if (defined ($timeout)) {
420      my $now = time;
421      my $elapsed = $now - $start;
422      $timeout -= $elapsed;
423      $start = $now;
424    }
425
426    return () unless $http; # error message already printed
427
428    $http =~ s/[\r\n]+$//s;
429
430    if ( $http =~ m@^HTTP/[0-9.]+ 30[123]@ ) {
431      $_ = $head;
432
433      my ( $location ) = m@^location:[ \t]*(.*)$@im;
434      if ( $location ) {
435        $location =~ s/[\r\n]$//;
436
437        LOG ($verbose_net, "redirect from $url to $location");
438        $referer = $url;
439        $url = $location;
440
441        if ($url =~ m@^/@) {
442          $referer =~ m@^(http://[^/]+)@i;
443          $url = $1 . $url;
444        } elsif (! ($url =~ m@^[a-z]+:@i)) {
445          $_ = $referer;
446          s@[^/]+$@@g if m@^http://[^/]+/@i;
447          $_ .= "/" if m@^http://[^/]+$@i;
448          $url = $_ . $url;
449        }
450
451      } else {
452        LOG ($verbose_net, "no Location with \"$http\"");
453        return ( $url, $body );
454      }
455
456      if ($loop_count++ > $max_loop_count) {
457        LOG ($verbose_net,
458             "too many redirects ($max_loop_count) from $orig_url");
459        $body = undef;
460        return ();
461      }
462
463    } elsif ( $http =~ m@^HTTP/[0-9.]+ ([4-9][0-9][0-9].*)$@ ) {
464
465      LOG (($verbose_net || $verbose_load), "failed: $1 ($url)");
466
467      # http errors -- return nothing.
468      $body = undef;
469      return ();
470
471    } elsif (!$body) {
472
473      LOG (($verbose_net || $verbose_load), "document contains no data: $url");
474      return ();
475
476    } else {
477
478      # ok!
479      return ( $url, $body );
480    }
481
482  } while (1);
483}
484
485# If we already have a cookie defined for this site, and the site is trying
486# to overwrite that very same cookie, let it do so.  This is because nytimes
487# expires its cookies - it lets you upgrade to a new cookie without logging
488# in again, but you have to present the old cookie to get the new cookie.
489# So, by doing this, the built-in cypherpunks cookie will never go "stale".
490#
491sub set_cookie {
492  my ($host, $cookie) = @_;
493  my $oc = $cookies{$host};
494  return unless $oc;
495  $_ = $oc;
496  my ($oc_name, $oc_value) = m@^([^= \t\r\n]+)=(.*)$@;
497  $_ = $cookie;
498  my ($nc_name, $nc_value) = m@^([^= \t\r\n]+)=(.*)$@;
499
500  if ($oc_name eq $nc_name &&
501      $oc_value ne $nc_value) {
502    $cookies{$host} = $cookie;
503    LOG ($verbose_net, "overwrote ${host}'s $oc_name cookie");
504  }
505}
506
507
508############################################################################
509#
510# Extracting image URLs from HTML
511#
512############################################################################
513
514# given a URL and the body text at that URL, selects and returns a random
515# image from it.  returns () if no suitable images found.
516#
517sub pick_image_from_body {
518  my ( $url, $body ) = @_;
519
520  my $base = $url;
521  $_ = $url;
522
523  # if there's at least one slash after the host, take off the last
524  # pathname component
525  if ( m@^http://[^/]+/@io ) {
526    $base =~ s@[^/]+$@@go;
527  }
528
529  # if there are no slashes after the host at all, put one on the end.
530  if ( m@^http://[^/]+$@io ) {
531    $base .= "/";
532  }
533
534  $_ = $body;
535
536  # strip out newlines, compress whitespace
537  s/[\r\n\t ]+/ /go;
538
539  # nuke comments
540  s/<!--.*?-->//go;
541
542
543  # There are certain web sites that list huge numbers of dictionary
544  # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise!
545  # Porn sites tend not to be reputable!)
546  #
547  # I do not want webcollage to filter on content: I want it to select
548  # randomly from the set of images on the web.  All the logic here for
549  # rejecting some images is really a set of heuristics for rejecting
550  # images that are not really images: for rejecting *text* that is in
551  # GIF/JPEG form.  I don't want text, I want pictures, and I want the
552  # content of the pictures to be randomly selected from among all the
553  # available content.
554  #
555  # So, filtering out "dirty" pictures by looking for "dirty" keywords
556  # would be wrong: dirty pictures exist, like it or not, so webcollage
557  # should be able to select them.
558  #
559  # However, picking a random URL is a hard thing to do.  The mechanism I'm
560  # using is to search for a selection of random words.  This is not
561  # perfect, but works ok most of the time.  The way it breaks down is when
562  # some URLs get precedence because their pages list *every word* as
563  # related -- those URLs come up more often than others.
564  #
565  # So, after we've retrieved a URL, if it has too many keywords, reject
566  # it.  We reject it not on the basis of what those keywords are, but on
567  # the basis that by having so many, the page has gotten an unfair
568  # advantage against our randomizer.
569  #
570  my $trip_count = 0;
571  foreach my $trip (@tripwire_words) {
572    $trip_count++ if m/$trip/i;
573  }
574
575  if ($trip_count >= $#tripwire_words - 2) {
576    LOG (($verbose_filter || $verbose_load),
577         "there is probably a dictionary in \"$url\": rejecting.");
578    $rejected_urls{$url} = -1;
579    $body = undef;
580    $_ = undef;
581    return ();
582  }
583
584
585  my @urls;
586  my %unique_urls;
587
588  foreach (split(/ *</)) {
589    if ( m/^meta /i ) {
590
591      # Likewise, reject any web pages that have a KEYWORDS meta tag
592      # that is too long.
593      #
594      if (m/name ?= ?\"?keywords\"?/i &&
595          m/content ?= ?\"([^\"]+)\"/) {
596        my $L = length($1);
597        if ($L > 1000) {
598          LOG (($verbose_filter || $verbose_load),
599               "excessive keywords ($L bytes) in $url: rejecting.");
600          $rejected_urls{$url} = $L;
601          $body = undef;
602          $_ = undef;
603          return ();
604        } else {
605          LOG ($verbose_filter, "  keywords ($L bytes) in $url (ok)");
606        }
607      }
608
609    } elsif ( m/^(img|a) .*(src|href) ?= ?\"? ?(.*?)[ >\"]/io ) {
610
611      my $was_inline = (! ( "$1" eq "a" || "$1" eq "A" ));
612      my $link = $3;
613      my ( $width )  = m/width ?=[ \"]*(\d+)/oi;
614      my ( $height ) = m/height ?=[ \"]*(\d+)/oi;
615      $_ = $link;
616
617      if ( m@^/@o ) {
618        my $site;
619        ( $site = $base ) =~ s@^(http://[^/]*).*@$1@gio;
620        $_ = "$site$link";
621      } elsif ( ! m@^[^/:?]+:@ ) {
622        $_ = "$base$link";
623        s@/\./@/@g;
624        1 while (s@/[^/]+/\.\./@/@g);
625      }
626
627      # skip non-http
628      if ( ! m@^http://@io ) {
629        next;
630      }
631
632      # skip non-image
633      if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg)$@io ) {
634        next;
635      }
636
637      # skip really short or really narrow images
638      if ( $width && $width < $min_width) {
639        if (!$height) { $height = "?"; }
640        LOG ($verbose_filter, "  skip narrow image $_ (${width}x$height)");
641        next;
642      }
643
644      if ( $height && $height < $min_height) {
645        if (!$width) { $width = "?"; }
646        LOG ($verbose_filter, "  skip short image $_ (${width}x$height)");
647        next;
648      }
649
650      # skip images with ratios that make them look like banners.
651      if ($min_ratio && $width && $height &&
652          ($width * $min_ratio ) > $height) {
653        if (!$height) { $height = "?"; }
654        LOG ($verbose_filter, "  skip bad ratio $_ (${width}x$height)");
655        next;
656      }
657
658      # skip GIFs with a small number of pixels -- those usually suck.
659      if ($width && $height &&
660          m/\.gif$/io &&
661          ($width * $height) < $min_gif_area) {
662        LOG ($verbose_filter, "  skip small GIF $_ (${width}x$height)");
663        next;
664      }
665     
666
667      my $url = $_;
668
669      if ($unique_urls{$url}) {
670        LOG ($verbose_filter, "  skip duplicate image $_");
671        next;
672      }
673
674      LOG ($verbose_filter,
675           "  image $url" .
676           ($width && $height ? " (${width}x${height})" : "") .
677           ($was_inline ? " (inline)" : ""));
678
679      $urls[++$#urls] = $url;
680      $unique_urls{$url}++;
681
682      # jpegs are preferable to gifs.
683      $_ = $url;
684      if ( ! m@[.]gif$@io ) {
685        $urls[++$#urls] = $url;
686      }
687
688      # pointers to images are preferable to inlined images.
689      if ( ! $was_inline ) {
690        $urls[++$#urls] = $url;
691        $urls[++$#urls] = $url;
692      }
693    }
694  }
695
696  my $fsp = ($body =~ m@<frameset@i);
697
698  $_ = undef;
699  $body = undef;
700
701  @urls = depoison (@urls);
702
703  if ( $#urls < 0 ) {
704    LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : ""));
705    return ();
706  }
707
708  # pick a random element of the table
709  my $i = int(rand($#urls+1));
710  $url = $urls[$i];
711
712  LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
713
714  return $url;
715}
716
717
718
719############################################################################
720#
721# Subroutines for getting pages and images out of search engines
722#
723############################################################################
724
725
726sub pick_dictionary {
727  my @dicts = ("/usr/dict/words",
728               "/usr/share/dict/words",
729               "/usr/share/lib/dict/words");
730  foreach my $f (@dicts) {
731    if (-f $f) {
732      $wordlist = $f;
733      last;
734    }
735  }
736  error ("$dicts[0] does not exist") unless defined($wordlist);
737}
738
739# returns a random word from the dictionary
740#
741sub random_word {
742    my $word = 0;
743    if (open (IN, "<$wordlist")) {
744        my $size = (stat(IN))[7];
745        my $pos = rand $size;
746        if (seek (IN, $pos, 0)) {
747            $word = <IN>;   # toss partial line
748            $word = <IN>;   # keep next line
749        }
750        if (!$word) {
751          seek( IN, 0, 0 );
752          $word = <IN>;
753        }
754        close (IN);
755    }
756
757    return 0 if (!$word);
758
759    $word =~ s/^[ \t\n\r]+//;
760    $word =~ s/[ \t\n\r]+$//;
761    $word =~ s/ys$/y/;
762    $word =~ s/ally$//;
763    $word =~ s/ly$//;
764    $word =~ s/ies$/y/;
765    $word =~ s/ally$/al/;
766    $word =~ s/izes$/ize/;
767    $word =~ tr/A-Z/a-z/;
768
769    if ( $word =~ s/[ \t\n\r]/\+/g ) {  # convert intra-word spaces to "+".
770      $word = "\%22$word\%22";          # And put quotes (%22) around it.
771    }
772
773    return $word;
774}
775
776sub random_words {
777  my ($or_p) = @_;
778  my $sep = ($or_p ? "%20OR%20" : "%20");
779  return (random_word . $sep .
780          random_word . $sep .
781          random_word . $sep .
782          random_word . $sep .
783          random_word);
784}
785
786
787sub url_quote {
788  my ($s) = @_;
789  $s =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge;
790  return $s;
791}
792
793sub url_unquote {
794  my ($s) = @_;
795  $s =~ s/[+]/ /g;
796  $s =~ s/%([a-z0-9]{2})/chr(hex($1))/ige;
797  return $s;
798}
799
800
801# Loads the given URL (a search on some search engine) and returns:
802# - the total number of hits the search engine claimed it had;
803# - a list of URLs from the page that the search engine returned;
804# Note that this list contains all kinds of internal search engine
805# junk URLs too -- caller must prune them.
806#
807sub pick_from_search_engine {
808  my ( $timeout, $search_url, $words ) = @_;
809
810  $_ = $words;
811  s/%20/ /g;
812
813  print STDERR "\n\n" if ($verbose_load);
814
815  LOG ($verbose_load, "words: $_");
816  LOG ($verbose_load, "URL: $search_url");
817
818  $last_search = $search_url;   # for warnings
819
820  my $start = time;
821  my ( $base, $body ) = get_document ($search_url, undef, $timeout);
822  if (defined ($timeout)) {
823    $timeout -= (time - $start);
824    if ($timeout <= 0) {
825      $body = undef;
826      LOG (($verbose_net || $verbose_load),
827           "timed out (late) for $search_url");
828      $suppress_audit = 1;
829      return ();
830    }
831  }
832
833  return () if (! $body);
834
835
836  my @subpages;
837
838  my $search_count = "?";
839  if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) {
840    $search_count = $3;
841  } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*)&nbsp;@i) {
842    $search_count = $1;
843  } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) {
844    $search_count = $1;
845  } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) {
846    $search_count = $1;
847  } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages
848    $search_count = $1;
849  } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages
850    $search_count = $1;
851  } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista
852    $search_count = $1;
853  } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages
854    $search_count = $1;
855  } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) {
856    $search_count = $1;    # lycos
857  } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) {
858    $search_count = $1;                          # hotbot
859  } elsif ($body =~ m@no photos were found containing@i) { # avimages
860    $search_count = "0";
861  } elsif ($body =~ m@found no document matching@i) { # avtext
862    $search_count = "0";
863  }
864  1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/);
865
866#  if ($search_count eq "?" || $search_count eq "0") {
867#    local *OUT;
868#    my $file = "/tmp/wc.html";
869#    open(OUT, ">$file") || error ("writing $file: $!");
870#    print OUT $body;
871#    close OUT;
872#    print STDERR  blurb() . "###### wrote $file\n";
873#  }
874
875
876  my $length = length($body);
877  my $href_count = 0;
878
879  $_ = $body;
880
881  s/[\r\n\t ]+/ /g;
882
883
884  s/(<A )/\n$1/gi;
885  foreach (split(/\n/)) {
886    $href_count++;
887    my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i;
888    next unless $u;
889
890    if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; }   # quoted string
891    elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; }  # or token
892
893    if ( $rejected_urls{$u} ) {
894      LOG ($verbose_filter, "  pre-rejecting candidate: $u");
895      next;
896    }
897
898    LOG ($verbose_http, "    HREF: $u");
899
900    $subpages[++$#subpages] = $u;
901  }
902
903  if ( $#subpages < 0 ) {
904    LOG ($verbose_filter,
905         "found nothing on $base ($length bytes, $href_count links).");
906    return ();
907  }
908
909  LOG ($verbose_filter, "" . $#subpages+1 . " links on $search_url");
910
911  return ($search_count, @subpages);
912}
913
914
915sub depoison {
916  my (@urls) = @_;
917  my @urls2 = ();
918  foreach (@urls) {
919    my ($h) = m@^http://([^/: \t\r\n]+)@i;
920
921    next unless defined($h);
922
923    if ($poisoners{$h}) {
924      LOG (($verbose_filter), "  rejecting poisoner: $_");
925      next;
926    }
927    if ($h =~ m@([^.]+\.[^.]+\.[^.]+)$@ &&
928        $poisoners{$1}) {
929      LOG (($verbose_filter), "  rejecting poisoner: $_");
930      next;
931    }
932    if ($h =~ m@([^.]+\.[^.]+)$@ &&
933        $poisoners{$1}) {
934      LOG (($verbose_filter), "  rejecting poisoner: $_");
935      next;
936    }
937
938    push @urls2, $_;
939  }
940  return @urls2;
941}
942
943
944# given a list of URLs, picks one at random; loads it; and returns a
945# random image from it.
946# returns the url of the page loaded; the url of the image chosen;
947# and a debugging description string.
948#
949sub pick_image_from_pages {
950  my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_;
951
952  $total_hit_count = "?" unless defined($total_hit_count);
953
954  @pages = depoison (@pages);
955  LOG ($verbose_load,
956       "" . ($#pages+1) . " candidates of $unfiltered_link_count links" .
957       " ($total_hit_count total)");
958
959  return () if ($#pages < 0);
960
961  my $i = int(rand($#pages+1));
962  my $page = $pages[$i];
963
964  LOG ($verbose_load, "picked page $page");
965
966  $suppress_audit = 1;
967
968  my ( $base2, $body2 ) = get_document ($page, $base, $timeout);
969
970  if (!$base2 || !$body2) {
971    $body2 = undef;
972    return ();
973  }
974
975  my $img = pick_image_from_body ($base2, $body2);
976  $body2 = undef;
977
978  if ($img) {
979    return ($base2, $img);
980  } else {
981    return ();
982  }
983}
984
985
986############################################################################
987#
988# Pick images from random pages returned by the Yahoo Random Link
989#
990############################################################################
991
992# yahoorand
993my $yahoo_random_link = "http://random.yahoo.com/fast/ryl";
994
995
996# Picks a random page; picks a random image on that page;
997# returns two URLs: the page containing the image, and the image.
998# Returns () if nothing found this time.
999#
1000sub pick_from_yahoo_random_link {
1001  my ( $timeout ) = @_;
1002
1003  print STDERR "\n\n" if ($verbose_load);
1004  LOG ($verbose_load, "URL: $yahoo_random_link");
1005
1006  $last_search = $yahoo_random_link;   # for warnings
1007
1008  $suppress_audit = 1;
1009
1010  my ( $base, $body ) = get_document ($yahoo_random_link, undef, $timeout);
1011  if (!$base || !$body) {
1012    $body = undef;
1013    return;
1014  }
1015
1016  LOG ($verbose_load, "redirected to: $base");
1017
1018  my $img = pick_image_from_body ($base, $body);
1019  $body = undef;
1020
1021  if ($img) {
1022    return ($base, $img);
1023  } else {
1024    return ();
1025  }
1026}
1027
1028
1029############################################################################
1030#
1031# Pick images from random pages returned by the Alta Vista Random Link
1032#
1033############################################################################
1034
1035# altavista
1036my $alta_vista_random_link = "http://www.altavista.com/image/randomlink";
1037
1038
1039# Picks a random page; picks a random image on that page;
1040# returns two URLs: the page containing the image, and the image.
1041# Returns () if nothing found this time.
1042#
1043sub pick_from_alta_vista_random_link {
1044  my ( $timeout ) = @_;
1045
1046  print STDERR "\n\n" if ($verbose_load);
1047  LOG ($verbose_load, "URL: $alta_vista_random_link");
1048
1049  $last_search = $alta_vista_random_link;   # for warnings
1050
1051  $suppress_audit = 1;
1052
1053  my ( $base, $body ) = get_document ($alta_vista_random_link,
1054                                      undef, $timeout);
1055  if (!$base || !$body) {
1056    $body = undef;
1057    return;
1058  }
1059
1060  LOG ($verbose_load, "redirected to: $base");
1061
1062  my $img = pick_image_from_body ($base, $body);
1063  $body = undef;
1064
1065  if ($img) {
1066    return ($base, $img);
1067  } else {
1068    return ();
1069  }
1070}
1071
1072
1073############################################################################
1074#
1075# Pick images by feeding random words into Alta Vista Image Search
1076#
1077############################################################################
1078
1079
1080my $alta_vista_images_url = "http://www.altavista.com/image/results" .
1081                            "?ipht=1" .       # photos
1082                            "&igrph=1" .      # graphics
1083                            "&iclr=1" .       # color
1084                            "&ibw=1" .        # b&w
1085                            "&micat=1" .      # no partner sites
1086                            "&sc=on" .        # "site collapse"
1087                            "&q=";
1088
1089# avimages
1090sub pick_from_alta_vista_images {
1091  my ( $timeout ) = @_;
1092
1093  my $words = random_words(0);
1094  my $page = (int(rand(9)) + 1);
1095  my $search_url = $alta_vista_images_url . $words;
1096
1097  if ($page > 1) {
1098    $search_url .= "&pgno=" . $page;            # page number
1099    $search_url .= "&stq=" . (($page-1) * 12);  # first hit result on page
1100  }
1101
1102  my ($search_hit_count, @subpages) =
1103    pick_from_search_engine ($timeout, $search_url, $words);
1104
1105  my @candidates = ();
1106  foreach my $u (@subpages) {
1107
1108    # avtext is encoding their URLs now.
1109    next unless ($u =~ m@^/r.*\&r=([^&]+).*@);
1110    $u = url_unquote($1);
1111
1112    next unless ($u =~ m@^http://@i);    #  skip non-HTTP or relative URLs
1113    next if ($u =~ m@[/.]altavista\.com\b@i);     # skip altavista builtins
1114    next if ($u =~ m@[/.]doubleclick\.net\b@i);   # you cretins
1115    next if ($u =~ m@[/.]clicktomarket\.com\b@i); # more cretins
1116
1117    next if ($u =~ m@[/.]viewimages\.com\b@i);    # stacked deck
1118    next if ($u =~ m@[/.]gettyimages\.com\b@i);
1119
1120    LOG ($verbose_filter, "  candidate: $u");
1121    push @candidates, $u;
1122  }
1123
1124  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1125                                $timeout, @candidates);
1126}
1127
1128
1129
1130############################################################################
1131#
1132# Pick images by feeding random words into Google Image Search.
1133# By Charles Gales <gales@us.ibm.com>
1134#
1135############################################################################
1136
1137
1138my $google_images_url =     "http://images.google.com/images" .
1139                            "?site=images" .  # photos
1140                            "&btnG=Search" .  # graphics
1141                            "&safe=off" .     # no screening
1142                            "&imgsafe=off" .
1143                            "&q=";
1144
1145# googleimgs
1146sub pick_from_google_images {
1147  my ( $timeout ) = @_;
1148
1149  my $words = random_word;   # only one word for Google
1150  my $page = (int(rand(9)) + 1);
1151  my $num = 20;     # 20 images per page
1152  my $search_url = $google_images_url . $words;
1153
1154  if ($page > 1) {
1155    $search_url .= "&start=" . $page*$num;      # page number
1156    $search_url .= "&num="   . $num;            #images per page
1157  }
1158
1159  my ($search_hit_count, @subpages) =
1160    pick_from_search_engine ($timeout, $search_url, $words);
1161
1162  my @candidates = ();
1163  foreach my $u (@subpages) {
1164    next unless ($u =~ m@imgres\?imgurl@i);    #  All pics start with this
1165    next if ($u =~ m@[/.]google\.com\b@i);     # skip google builtins
1166
1167    if ($u =~ m@^/imgres\?imgurl=(.*?)\&imgrefurl=(.*?)\&@) {
1168      my $urlf = $2;
1169      LOG ($verbose_filter, "  candidate: $urlf");
1170      push @candidates, $urlf;
1171    }
1172  }
1173
1174  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1175                                $timeout, @candidates);
1176}
1177
1178
1179
1180############################################################################
1181#
1182# Pick images by feeding random *numbers* into Google Image Search.
1183# By jwz, suggested by from Ian O'Donnell.
1184#
1185############################################################################
1186
1187
1188# googlenums
1189sub pick_from_google_image_numbers {
1190  my ( $timeout ) = @_;
1191
1192  my $max = 9999;
1193  my $number = int(rand($max));
1194
1195  $number = sprintf("%04d", $number)
1196    if (rand() < 0.3);
1197
1198  my $words = "$number";
1199  my $page = (int(rand(40)) + 1);
1200  my $num = 20;     # 20 images per page
1201  my $search_url = $google_images_url . $words;
1202
1203  if ($page > 1) {
1204    $search_url .= "&start=" . $page*$num;      # page number
1205    $search_url .= "&num="   . $num;            #images per page
1206  }
1207
1208  my ($search_hit_count, @subpages) =
1209    pick_from_search_engine ($timeout, $search_url, $words);
1210
1211  my @candidates = ();
1212  my %referers;
1213  foreach my $u (@subpages) {
1214    next unless ($u =~ m@imgres\?imgurl@i);    #  All pics start with this
1215    next if ($u =~ m@[/.]google\.com\b@i);     # skip google builtins
1216
1217    if ($u =~ m@^/imgres\?imgurl=(.*?)\&imgrefurl=(.*?)\&@) {
1218      my $ref = $2;
1219      my $img = "http://$1";
1220
1221      LOG ($verbose_filter, "  candidate: $ref");
1222      push @candidates, $img;
1223      $referers{$img} = $ref;
1224    }
1225  }
1226
1227  @candidates = depoison (@candidates);
1228  return () if ($#candidates < 0);
1229  my $i = int(rand($#candidates+1));
1230  my $img = $candidates[$i];
1231  my $ref = $referers{$img};
1232
1233  LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
1234  return ($ref, $img);
1235}
1236
1237
1238
1239############################################################################
1240#
1241# Pick images by feeding random words into Alta Vista Text Search
1242#
1243############################################################################
1244
1245
1246my $alta_vista_url = "http://www.altavista.com/web/results" .
1247                     "?pg=aq" .
1248                     "&aqmode=s" .
1249                     "&filetype=html" .
1250                     "&sc=on" .        # "site collapse"
1251                     "&nbq=50" .
1252                     "&aqo=";
1253
1254# avtext
1255sub pick_from_alta_vista_text {
1256  my ( $timeout ) = @_;
1257
1258  my $words = random_words(0);
1259  my $page = (int(rand(9)) + 1);
1260  my $search_url = $alta_vista_url . $words;
1261
1262  if ($page > 1) {
1263    $search_url .= "&pgno=" . $page;
1264    $search_url .= "&stq=" . (($page-1) * 10);
1265  }
1266
1267  my ($search_hit_count, @subpages) =
1268    pick_from_search_engine ($timeout, $search_url, $words);
1269
1270  my @candidates = ();
1271  foreach my $u (@subpages) {
1272
1273    # Those altavista fuckers are playing really nasty redirection games
1274    # these days: the filter your clicks through their site, but use
1275    # onMouseOver to make it look like they're not!  Well, it makes it
1276    # easier for us to identify search results...
1277    #
1278    next unless ($u =~ m@^/r.*\&r=([^&]+).*@);
1279    $u = url_unquote($1);
1280
1281    LOG ($verbose_filter, "  candidate: $u");
1282    push @candidates, $u;
1283  }
1284
1285  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1286                                $timeout, @candidates);
1287}
1288
1289
1290
1291############################################################################
1292#
1293# Pick images by feeding random words into Hotbot
1294#
1295############################################################################
1296
1297my $hotbot_search_url =("http://hotbot.lycos.com/default.asp" .
1298                        "?ca=w" .
1299                        "&descriptiontype=0" .
1300                        "&imagetoggle=1" .
1301                        "&matchmode=any" .
1302                        "&nummod=2" .
1303                        "&recordcount=50" .
1304                        "&sitegroup=1" .
1305                        "&stem=1" .
1306                        "&cobrand=undefined" .
1307                        "&query=");
1308
1309sub pick_from_hotbot_text {
1310  my ( $timeout ) = @_;
1311
1312  # lycos seems to always give us back dictionaries and word lists if
1313  # we search for more than one word...
1314  #
1315  my $words = random_word();
1316
1317  my $start = int(rand(8)) * 10 + 1;
1318  my $search_url = $hotbot_search_url . $words . "&first=$start&page=more";
1319
1320  my ($search_hit_count, @subpages) =
1321    pick_from_search_engine ($timeout, $search_url, $words);
1322
1323  my @candidates = ();
1324  foreach my $u (@subpages) {
1325
1326    # Hotbot plays redirection games too
1327    next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@);
1328    $u = url_decode($1);
1329
1330    LOG ($verbose_filter, "  candidate: $u");
1331    push @candidates, $u;
1332  }
1333
1334  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1335                                $timeout, @candidates);
1336}
1337
1338
1339
1340############################################################################
1341#
1342# Pick images by feeding random words into Lycos
1343#
1344############################################################################
1345
1346my $lycos_search_url = "http://search.lycos.com/default.asp" .
1347                       "?lpv=1" .
1348                       "&loc=searchhp" .
1349                       "&tab=web" .
1350                       "&query=";
1351
1352sub pick_from_lycos_text {
1353  my ( $timeout ) = @_;
1354
1355  # lycos seems to always give us back dictionaries and word lists if
1356  # we search for more than one word...
1357  #
1358  my $words = random_word();
1359
1360  my $start = int(rand(8)) * 10 + 1;
1361  my $search_url = $lycos_search_url . $words . "&first=$start&page=more";
1362
1363  my ($search_hit_count, @subpages) =
1364    pick_from_search_engine ($timeout, $search_url, $words);
1365
1366  my @candidates = ();
1367  foreach my $u (@subpages) {
1368
1369    # Lycos plays redirection games.
1370    next unless ($u =~ m@^http://click.lycos.com/director.asp
1371                         .*
1372                         \btarget=([^&]+)
1373                         .*
1374                        @x);
1375    $u = url_decode($1);
1376
1377    LOG ($verbose_filter, "  candidate: $u");
1378    push @candidates, $u;
1379  }
1380
1381  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1382                                $timeout, @candidates);
1383}
1384
1385
1386
1387############################################################################
1388#
1389# Pick images by feeding random words into news.yahoo.com
1390#
1391############################################################################
1392
1393my $yahoo_news_url = "http://search.news.yahoo.com/search/news" .
1394                     "?a=1" .
1395                     "&c=news_photos" .
1396                     "&s=-%24s%2C-date" .
1397                     "&n=100" .
1398                     "&o=o" .
1399                     "&2=" .
1400                     "&3=" .
1401                     "&p=";
1402
1403# yahoonews
1404sub pick_from_yahoo_news_text {
1405  my ( $timeout ) = @_;
1406
1407  my $words = random_words(0);
1408  my $search_url = $yahoo_news_url . $words;
1409
1410  my ($search_hit_count, @subpages) =
1411    pick_from_search_engine ($timeout, $search_url, $words);
1412
1413  my @candidates = ();
1414  foreach my $u (@subpages) {
1415    # only accept URLs on Yahoo's news site
1416    next unless ($u =~ m@^http://dailynews\.yahoo\.com/@i ||
1417                 $u =~ m@^http://story\.news\.yahoo\.com/@i);
1418
1419    LOG ($verbose_filter, "  candidate: $u");
1420    push @candidates, $u;
1421  }
1422
1423  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1424                                $timeout, @candidates);
1425}
1426
1427
1428
1429
1430############################################################################
1431#
1432# Pick images by waiting for driftnet to populate a temp dir with files.
1433# Requires driftnet version 0.1.5 or later.
1434# (Driftnet is a program by Chris Lightfoot that sniffs your local ethernet
1435# for images being downloaded by others.)
1436# Driftnet/webcollage integration by jwz.
1437#
1438############################################################################
1439
1440# driftnet
1441sub pick_from_driftnet {
1442  my ( $timeout ) = @_;
1443
1444  my $id = $driftnet_magic;
1445  my $dir = $driftnet_dir;
1446  my $start = time;
1447  my $now;
1448
1449  error ("\$driftnet_dir unset?") unless ($dir);
1450  $dir =~ s@/+$@@;
1451
1452  error ("$dir unreadable") unless (-d "$dir/.");
1453
1454  $timeout = $http_timeout unless ($timeout);
1455  $last_search = $id;
1456
1457  while ($now = time, $now < $start + $timeout) {
1458    local *DIR;
1459    opendir (DIR, $dir) || error ("$dir: $!");
1460    while (my $file = readdir(DIR)) {
1461      next if ($file =~ m/^\./);
1462      $file = "$dir/$file";
1463      closedir DIR;
1464      LOG ($verbose_load, "picked file $file ($id)");
1465      return ($id, $file);
1466    }
1467    closedir DIR;
1468  }
1469  LOG (($verbose_net || $verbose_load), "timed out for $id");
1470  return ();
1471}
1472
1473
1474sub get_driftnet_file {
1475  my ($file) = @_;
1476
1477  error ("\$driftnet_dir unset?") unless ($driftnet_dir);
1478
1479  my $id = $driftnet_magic;
1480  my $re = qr/$driftnet_dir/;
1481  error ("$id: $file not in $driftnet_dir?")
1482    unless ($file =~ m@^$re@o);
1483
1484  local *IN;
1485  open (IN, $file) || error ("$id: $file: $!");
1486  my $body = '';
1487  while (<IN>) { $body .= $_; }
1488  close IN || error ("$id: $file: $!");
1489  unlink ($file) || error ("$id: $file: rm: $!");
1490  return ($id, $body);
1491}
1492
1493
1494sub spawn_driftnet {
1495  my ($cmd) = @_;
1496
1497  # make a directory to use.
1498  while (1) {
1499    my $tmp = $ENV{TEMPDIR} || "/tmp";
1500    $driftnet_dir = sprintf ("$tmp/driftcollage-%08x", rand(0xffffffff));
1501    LOG ($verbose_exec, "mkdir $driftnet_dir");
1502    last if mkdir ($driftnet_dir, 0700);
1503  }
1504
1505  if (! ($cmd =~ m/\s/)) {
1506    # if the command didn't have any arguments in it, then it must be just
1507    # a pointer to the executable.  Append the default args to it.
1508    my $dargs = $default_driftnet_cmd;
1509    $dargs =~ s/^[^\s]+//;
1510    $cmd .= $dargs;
1511  }
1512
1513  # point the driftnet command at our newly-minted private directory.
1514  #
1515  $cmd .= " -d $driftnet_dir";
1516  $cmd .= ">/dev/null" unless ($verbose_exec);
1517
1518  my $pid = fork();
1519  if ($pid < 0) { error ("fork: $!\n"); }
1520  if ($pid) {
1521    # parent fork
1522    push @pids_to_kill, $pid;
1523    LOG ($verbose_exec, "forked for \"$cmd\"");
1524  } else {
1525    # child fork
1526    nontrapping_system ($cmd) || error ("exec: $!");
1527  }
1528
1529  # wait a bit, then make sure the process actually started up.
1530  #
1531  sleep (1);
1532  error ("pid $pid failed to start \"$cmd\"")
1533    unless (1 == kill (0, $pid));
1534}
1535
1536
1537############################################################################
1538#
1539# Pick a random image in a random way
1540#
1541############################################################################
1542
1543
1544# Picks a random image on a random page, and returns two URLs:
1545# the page containing the image, and the image.
1546# Returns () if nothing found this time.
1547#
1548
1549sub pick_image {
1550  my ( $timeout ) = @_;
1551
1552  $current_state = "select";
1553  $load_method = "none";
1554
1555  my $n = int(rand(100));
1556  my $fn = undef;
1557  my $total = 0;
1558  my @rest = @search_methods;
1559
1560  while (@rest) {
1561    my $pct  = shift @rest;
1562    my $name = shift @rest;
1563    my $tfn  = shift @rest;
1564    $total += $pct;
1565    if ($total > $n && !defined($fn)) {
1566      $fn = $tfn;
1567      $current_state = $name;
1568      $load_method = $current_state;
1569    }
1570  }
1571
1572  if ($total != 100) {
1573    error ("internal error: \@search_methods totals to $total%!");
1574  }
1575
1576  record_attempt ($current_state);
1577  return $fn->($timeout);
1578}
1579
1580
1581
1582############################################################################
1583#
1584# Statistics and logging
1585#
1586############################################################################
1587
1588sub timestr {
1589  return strftime ("%H:%M:%S: ", localtime);
1590}
1591
1592sub blurb {
1593  return "$progname: " . timestr() . "$current_state: ";
1594}
1595
1596sub error {
1597  my ($err) = @_;
1598  print STDERR blurb() . "$err\n";
1599  exit 1;
1600}
1601
1602
1603my $lastlog = "";
1604
1605sub clearlog {
1606  $lastlog = "";
1607}
1608
1609sub showlog {
1610  my $head = "$progname: DEBUG: ";
1611  foreach (split (/\n/, $lastlog)) {
1612    print STDERR "$head$_\n";
1613  }
1614  $lastlog = "";
1615}
1616
1617sub LOG {
1618  my ($print, $msg) = @_;
1619  my $blurb = timestr() . "$current_state: ";
1620  $lastlog .= "$blurb$msg\n";
1621  print STDERR "$progname: $blurb$msg\n" if $print;
1622}
1623
1624
1625my %stats_attempts;
1626my %stats_successes;
1627my %stats_elapsed;
1628
1629my $last_state = undef;
1630sub record_attempt {
1631  my ($name) = @_;
1632
1633  if ($last_state) {
1634    record_failure($last_state) unless ($image_succeeded > 0);
1635  }
1636  $last_state = $name;
1637
1638  clearlog();
1639  report_performance();
1640
1641  start_timer($name);
1642  $image_succeeded = 0;
1643  $suppress_audit = 0;
1644}
1645
1646sub record_success {
1647  my ($name, $url, $base) = @_;
1648  if (defined($stats_successes{$name})) {
1649    $stats_successes{$name}++;
1650  } else {
1651    $stats_successes{$name} = 1;
1652  }
1653
1654  stop_timer ($name, 1);
1655  my $o = $current_state;
1656  $current_state = $name;
1657  save_recent_url ($url, $base);
1658  $current_state = $o;
1659  $image_succeeded = 1;
1660  clearlog();
1661}
1662
1663
1664sub record_failure {
1665  my ($name) = @_;
1666
1667  return if $image_succeeded;
1668
1669  stop_timer ($name, 0);
1670  if ($verbose_load && !$verbose_exec) {
1671
1672    if ($suppress_audit) {
1673      print STDERR "$progname: " . timestr() . "(audit log suppressed)\n";
1674      return;
1675    }
1676
1677    my $o = $current_state;
1678    $current_state = "DEBUG";
1679
1680    my $line =  "#" x 78;
1681    print STDERR "\n\n\n";
1682    print STDERR ("#" x 78) . "\n";
1683    print STDERR blurb() . "failed to get an image.  Full audit log:\n";
1684    print STDERR "\n";
1685    showlog();
1686    print STDERR ("-" x 78) . "\n";
1687    print STDERR "\n\n";
1688
1689    $current_state = $o;
1690  }
1691  $image_succeeded = 0;
1692}
1693
1694
1695
1696sub stats_of {
1697  my ($name) = @_;
1698  my $i = $stats_successes{$name};
1699  my $j = $stats_attempts{$name};
1700  $i = 0 unless $i;
1701  $j = 0 unless $j;
1702  return "" . ($j ? int($i * 100 / $j) : "0") . "%";
1703}
1704
1705
1706my $current_start_time = 0;
1707
1708sub start_timer {
1709  my ($name) = @_;
1710  $current_start_time = time;
1711
1712  if (defined($stats_attempts{$name})) {
1713    $stats_attempts{$name}++;
1714  } else {
1715    $stats_attempts{$name} = 1;
1716  }
1717  if (!defined($stats_elapsed{$name})) {
1718    $stats_elapsed{$name} = 0;
1719  }
1720}
1721
1722sub stop_timer {
1723  my ($name, $success) = @_;
1724  $stats_elapsed{$name} += time - $current_start_time;
1725}
1726
1727
1728my $last_report_time = 0;
1729sub report_performance {
1730
1731  return unless $verbose_warnings;
1732
1733  my $now = time;
1734  return unless ($now >= $last_report_time + $report_performance_interval);
1735  my $ot = $last_report_time;
1736  $last_report_time = $now;
1737
1738  return if ($ot == 0);
1739
1740  my $blurb = "$progname: " . timestr();
1741
1742  print STDERR "\n";
1743  print STDERR "${blurb}Current standings:\n";
1744
1745  foreach my $name (sort keys (%stats_attempts)) {
1746    my $try = $stats_attempts{$name};
1747    my $suc = $stats_successes{$name} || 0;
1748    my $pct = int($suc * 100 / $try);
1749    my $secs = $stats_elapsed{$name};
1750    my $secs_link = int($secs / $try);
1751    print STDERR sprintf ("$blurb   %-12s %4s (%d/%d);\t %2d secs/link\n",
1752                          "$name:", "$pct%", $suc, $try, $secs_link);
1753  }
1754}
1755
1756
1757
1758my $max_recent_images = 400;
1759my $max_recent_sites  = 20;
1760my @recent_images = ();
1761my @recent_sites = ();
1762
1763sub save_recent_url {
1764  my ($url, $base) = @_;
1765
1766  return unless ($verbose_warnings);
1767
1768  $_ = $url;
1769  my ($site) = m@^http://([^ \t\n\r/:]+)@;
1770  return unless defined ($site);
1771
1772  if ($base eq $driftnet_magic) {
1773    $site = $driftnet_magic;
1774    @recent_images = ();
1775  }
1776
1777  my $done = 0;
1778  foreach (@recent_images) {
1779    if ($_ eq $url) {
1780      print STDERR blurb() . "WARNING: recently-duplicated image: $url" .
1781        " (on $base via $last_search)\n";
1782      $done = 1;
1783      last;
1784    }
1785  }
1786
1787  # suppress "duplicate site" warning via %warningless_sites.
1788  #
1789  if ($warningless_sites{$site}) {
1790    $done = 1;
1791  } elsif ($site =~ m@([^.]+\.[^.]+\.[^.]+)$@ &&
1792           $warningless_sites{$1}) {
1793    $done = 1;
1794  } elsif ($site =~ m@([^.]+\.[^.]+)$@ &&
1795           $warningless_sites{$1}) {
1796    $done = 1;
1797  }
1798
1799  if (!$done) {
1800    foreach (@recent_sites) {
1801      if ($_ eq $site) {
1802        print STDERR blurb() . "WARNING: recently-duplicated site: $site" .
1803        " ($url on $base via $last_search)\n";
1804        last;
1805      }
1806    }
1807  }
1808
1809  push @recent_images, $url;
1810  push @recent_sites,  $site;
1811  shift @recent_images if ($#recent_images >= $max_recent_images);
1812  shift @recent_sites  if ($#recent_sites  >= $max_recent_sites);
1813}
1814
1815
1816
1817##############################################################################
1818#
1819# other utilities
1820#
1821##############################################################################
1822
1823# Does %-decoding.
1824#
1825sub url_decode {
1826  ($_) = @_;
1827  tr/+/ /;
1828  s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
1829  return $_;
1830}
1831
1832
1833# Given the raw body of a GIF document, returns the dimensions of the image.
1834#
1835sub gif_size {
1836  my ($body) = @_;
1837  my $type = substr($body, 0, 6);
1838  my $s;
1839  return () unless ($type =~ /GIF8[7,9]a/);
1840  $s = substr ($body, 6, 10);
1841  my ($a,$b,$c,$d) = unpack ("C"x4, $s);
1842  return (($b<<8|$a), ($d<<8|$c));
1843}
1844
1845# Given the raw body of a JPEG document, returns the dimensions of the image.
1846#
1847sub jpeg_size {
1848  my ($body) = @_;
1849  my $i = 0;
1850  my $L = length($body);
1851
1852  my $c1 = substr($body, $i, 1); $i++;
1853  my $c2 = substr($body, $i, 1); $i++;
1854  return () unless (ord($c1) == 0xFF && ord($c2) == 0xD8);
1855
1856  my $ch = "0";
1857  while (ord($ch) != 0xDA && $i < $L) {
1858    # Find next marker, beginning with 0xFF.
1859    while (ord($ch) != 0xFF) {
1860      return () if (length($body) <= $i);
1861      $ch = substr($body, $i, 1); $i++;
1862    }
1863    # markers can be padded with any number of 0xFF.
1864    while (ord($ch) == 0xFF) {
1865      return () if (length($body) <= $i);
1866      $ch = substr($body, $i, 1); $i++;
1867    }
1868
1869    # $ch contains the value of the marker.
1870    my $marker = ord($ch);
1871
1872    if (($marker >= 0xC0) &&
1873        ($marker <= 0xCF) &&
1874        ($marker != 0xC4) &&
1875        ($marker != 0xCC)) {  # it's a SOFn marker
1876      $i += 3;
1877      return () if (length($body) <= $i);
1878      my $s = substr($body, $i, 4); $i += 4;
1879      my ($a,$b,$c,$d) = unpack("C"x4, $s);
1880      return (($c<<8|$d), ($a<<8|$b));
1881
1882    } else {
1883      # We must skip variables, since FFs in variable names aren't
1884      # valid JPEG markers.
1885      return () if (length($body) <= $i);
1886      my $s = substr($body, $i, 2); $i += 2;
1887      my ($c1, $c2) = unpack ("C"x2, $s);
1888      my $length = ($c1 << 8) | $c2;
1889      return () if ($length < 2);
1890      $i += $length-2;
1891    }
1892  }
1893  return ();
1894}
1895
1896# Given the raw body of a GIF or JPEG document, returns the dimensions of
1897# the image.
1898#
1899sub image_size {
1900  my ($body) = @_;
1901  my ($w, $h) = gif_size ($body);
1902  if ($w && $h) { return ($w, $h); }
1903  return jpeg_size ($body);
1904}
1905
1906
1907# returns the full path of the named program, or undef.
1908#
1909sub which {
1910  my ($prog) = @_;
1911  foreach (split (/:/, $ENV{PATH})) {
1912    if (-x "$_/$prog") {
1913      return $prog;
1914    }
1915  }
1916  return undef;
1917}
1918
1919
1920# Like rand(), but chooses numbers with a bell curve distribution.
1921sub bellrand {
1922  ($_) = @_;
1923  $_ = 1.0 unless defined($_);
1924  $_ /= 3.0;
1925  return (rand($_) + rand($_) + rand($_));
1926}
1927
1928
1929sub signal_cleanup {
1930  my ($sig) = @_;
1931  print STDERR blurb() . (defined($sig)
1932                          ? "caught signal $sig."
1933                          : "exiting.")
1934                       . "\n"
1935    if ($verbose_exec);
1936
1937  x_cleanup();
1938
1939  if (@pids_to_kill) {
1940    print STDERR blurb() . "killing: " . join(' ', @pids_to_kill) . "\n";
1941    kill ('TERM', @pids_to_kill);
1942  }
1943
1944  exit 1;
1945}
1946
1947##############################################################################
1948#
1949# Generating a list of urls only
1950#
1951##############################################################################
1952
1953sub url_only_output {
1954  do {
1955    my ($base, $img) = pick_image;
1956    if ($img) {
1957      $base =~ s/ /%20/g;
1958      $img  =~ s/ /%20/g;
1959      print "$img $base\n";
1960    }
1961  } while (1);
1962}
1963
1964##############################################################################
1965#
1966# Running as an xscreensaver module
1967#
1968##############################################################################
1969
1970my $image_ppm   = ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp") . "/webcollage." . $$;
1971my $image_tmp1  = $image_ppm . "-1";
1972my $image_tmp2  = $image_ppm . "-2";
1973
1974my $filter_cmd = undef;
1975my $post_filter_cmd = undef;
1976my $background = undef;
1977
1978my $img_width;            # size of the image being generated.
1979my $img_height;
1980
1981my $delay = 2;
1982
1983sub x_cleanup {
1984  unlink $image_ppm, $image_tmp1, $image_tmp2;
1985}
1986
1987
1988# Like system, but prints status about exit codes, and kills this process
1989# with whatever signal killed the sub-process, if any.
1990#
1991sub nontrapping_system {
1992  $! = 0;
1993
1994  $_ = join(" ", @_);
1995  s/\"[^\"]+\"/\"...\"/g;
1996
1997  LOG ($verbose_exec, "executing \"$_\"");
1998
1999  my $rc = system @_;
2000
2001  if ($rc == 0) {
2002    LOG ($verbose_exec, "subproc exited normally.");
2003  } elsif (($rc & 0xff) == 0) {
2004    $rc >>= 8;
2005    LOG ($verbose_exec, "subproc exited with status $rc.");
2006  } else {
2007    if ($rc & 0x80) {
2008      LOG ($verbose_exec, "subproc dumped core.");
2009      $rc &= ~0x80;
2010    }
2011    LOG ($verbose_exec, "subproc died with signal $rc.");
2012    # die that way ourselves.
2013    kill $rc, $$;
2014  }
2015
2016  return $rc;
2017}
2018
2019
2020# Given the URL of a GIF or JPEG image, and the body of that image, writes a
2021# PPM to the given output file.  Returns the width/height of the image if
2022# successful.
2023#
2024sub image_to_pnm {
2025  my ($url, $body, $output) = @_;
2026  my ($cmd, $cmd2, $w, $h);
2027
2028  if ((@_ = gif_size ($body))) {
2029    ($w, $h) = @_;
2030    $cmd = "giftopnm";
2031  } elsif ((@_ = jpeg_size ($body))) {
2032    ($w, $h) = @_;
2033    $cmd = "djpeg";
2034  } else {
2035    LOG (($verbose_pbm || $verbose_load),
2036         "not a GIF or JPG" .
2037         (($body =~ m@<(base|html|head|body|script|table|a href)>@i)
2038          ? " (looks like HTML)" : "") .
2039         ": $url");
2040    $suppress_audit = 1;
2041    return ();
2042  }
2043
2044  $cmd2 = "exec $cmd";        # yes, this really is necessary.  if we don't
2045                              # do this, the process doesn't die properly.
2046  if (!$verbose_pbm) {
2047    #
2048    # We get a "giftopnm: got a 'Application Extension' extension"
2049    # warning any time it's an animgif.
2050    #
2051    # Note that "giftopnm: EOF / read error on image data" is not
2052    # always a fatal error -- sometimes the image looks fine anyway.
2053    #
2054    $cmd2 .= " 2>/dev/null";
2055  }
2056
2057  # There exist corrupted GIF and JPEG files that can make giftopnm and
2058  # djpeg lose their minds and go into a loop.  So this gives those programs
2059  # a small timeout -- if they don't complete in time, kill them.
2060  #
2061  my $pid;
2062  @_ = eval {
2063    my $timed_out;
2064
2065    local $SIG{ALRM}  = sub {
2066      LOG ($verbose_pbm,
2067           "timed out ($cvt_timeout) for $cmd on \"$url\" in pid $pid");
2068      kill ('TERM', $pid) if ($pid);
2069      $timed_out = 1;
2070      $body = undef;
2071    };
2072
2073    if (($pid = open(PIPE, "| $cmd2 > $output"))) {
2074      $timed_out = 0;
2075      alarm $cvt_timeout;
2076      print PIPE $body;
2077      $body = undef;
2078      close PIPE;
2079
2080      LOG ($verbose_exec, "awaiting $pid");
2081      waitpid ($pid, 0);
2082      LOG ($verbose_exec, "$pid completed");
2083
2084      my $size = (stat($output))[7];
2085      $size = -1 unless defined($size);
2086      if ($size < 5) {
2087        LOG ($verbose_pbm, "$cmd on ${w}x$h \"$url\" failed ($size bytes)");
2088        return ();
2089      }
2090
2091      LOG ($verbose_pbm, "created ${w}x$h $output ($cmd)");
2092      return ($w, $h);
2093    } else {
2094      print STDERR blurb() . "$cmd failed: $!\n";
2095      return ();
2096    }
2097  };
2098  die if ($@ && $@ ne "alarm\n");       # propagate errors
2099  if ($@) {
2100    # timed out
2101    $body = undef;
2102    return ();
2103  } else {
2104    # didn't
2105    alarm 0;
2106    $body = undef;
2107    return @_;
2108  }
2109}
2110
2111sub pick_root_displayer {
2112  my @names = ();
2113
2114  foreach my $cmd (@root_displayers) {
2115    $_ = $cmd;
2116    my ($name) = m/^([^ ]+)/;
2117    push @names, "\"$name\"";
2118    LOG ($verbose_exec, "looking for $name...");
2119    foreach my $dir (split (/:/, $ENV{PATH})) {
2120      LOG ($verbose_exec, "  checking $dir/$name");
2121      return $cmd if (-x "$dir/$name");
2122    }
2123  }
2124
2125  $names[$#names] = "or " . $names[$#names];
2126  error "none of: " . join (", ", @names) . " were found on \$PATH.";
2127}
2128
2129
2130my $ppm_to_root_window_cmd = undef;
2131
2132
2133sub x_or_pbm_output {
2134
2135  # Check for our helper program, to see whether we need to use PPM pipelines.
2136  #
2137  $_ = "webcollage-helper";
2138  if (defined ($webcollage_helper) || which ($_)) {
2139    $webcollage_helper = $_ unless (defined($webcollage_helper));
2140    LOG ($verbose_pbm, "found \"$webcollage_helper\"");
2141    $webcollage_helper .= " -v";
2142  } else {
2143    LOG (($verbose_pbm || $verbose_load), "no $_ program");
2144  }
2145
2146  # make sure the various programs we execute exist, right up front.
2147  #
2148  my @progs = ("ppmmake");  # always need this one
2149
2150  if (!defined($webcollage_helper)) {
2151    # Only need these others if we don't have the helper.
2152    @progs = (@progs, "giftopnm", "djpeg", "pnmpaste", "pnmscale", "pnmcut");
2153  }
2154
2155  foreach (@progs) {
2156    which ($_) || error "$_ not found on \$PATH.";
2157  }
2158
2159  # find a root-window displayer program.
2160  #
2161  $ppm_to_root_window_cmd = pick_root_displayer();
2162
2163  if (!$img_width || !$img_height) {
2164    $_ = "xdpyinfo";
2165    which ($_) || error "$_ not found on \$PATH.";
2166    $_ = `$_`;
2167    ($img_width, $img_height) = m/dimensions: *(\d+)x(\d+) /;
2168    if (!defined($img_height)) {
2169      error "xdpyinfo failed.";
2170    }
2171  }
2172
2173  my $bgcolor = "#000000";
2174  my $bgimage = undef;
2175
2176  if ($background) {
2177    if ($background =~ m/^\#[0-9a-f]+$/i) {
2178      $bgcolor = $background;
2179
2180    } elsif (-r $background) {
2181      $bgimage = $background;
2182
2183    } elsif (! $background =~ m@^[-a-z0-9 ]+$@i) {
2184      error "not a color or readable file: $background";
2185
2186    } else {
2187      # default to assuming it's a color
2188      $bgcolor = $background;
2189    }
2190  }
2191
2192  # Create the sold-colored base image.
2193  #
2194  $_ = "ppmmake '$bgcolor' $img_width $img_height";
2195  LOG ($verbose_pbm, "creating base image: $_");
2196  nontrapping_system "$_ > $image_ppm";
2197
2198  # Paste the default background image in the middle of it.
2199  #
2200  if ($bgimage) {
2201    my ($iw, $ih);
2202
2203    my $body = "";
2204    local *IMG;
2205    open(IMG, "<$bgimage") || error "couldn't open $bgimage: $!";
2206    my $cmd;
2207    while (<IMG>) { $body .= $_; }
2208    close (IMG);
2209
2210    if ((@_ = gif_size ($body))) {
2211      ($iw, $ih) = @_;
2212      $cmd = "giftopnm |";
2213
2214    } elsif ((@_ = jpeg_size ($body))) {
2215      ($iw, $ih) = @_;
2216      $cmd = "djpeg |";
2217
2218    } elsif ($body =~ m/^P\d\n(\d+) (\d+)\n/) {
2219      $iw = $1;
2220      $ih = $2;
2221      $cmd = "";
2222
2223    } else {
2224      error "$bgimage is not a GIF, JPEG, or PPM.";
2225    }
2226
2227    my $x = int (($img_width  - $iw) / 2);
2228    my $y = int (($img_height - $ih) / 2);
2229    LOG ($verbose_pbm,
2230         "pasting $bgimage (${iw}x$ih) into base image at $x,$y");
2231
2232    $cmd .= "pnmpaste - $x $y $image_ppm > $image_tmp1";
2233    open (IMG, "| $cmd") || error "running $cmd: $!";
2234    print IMG $body;
2235    $body = undef;
2236    close (IMG);
2237    LOG ($verbose_exec, "subproc exited normally.");
2238    rename ($image_tmp1, $image_ppm) ||
2239      error "renaming $image_tmp1 to $image_ppm: $!";
2240  }
2241
2242  clearlog();
2243
2244  while (1) {
2245    my ($base, $img) = pick_image();
2246    my $source = $current_state;
2247    $current_state = "loadimage";
2248    if ($img) {
2249      my ($headers, $body) = get_document ($img, $base);
2250      if ($body) {
2251        paste_image ($base, $img, $body, $source);
2252        $body = undef;
2253      }
2254    }
2255    $current_state = "idle";
2256    $load_method = "none";
2257
2258    unlink $image_tmp1, $image_tmp2;
2259    sleep $delay;
2260  }
2261}
2262
2263sub paste_image {
2264  my ($base, $img, $body, $source) = @_;
2265
2266  $current_state = "paste";
2267
2268  $suppress_audit = 0;
2269
2270  LOG ($verbose_pbm, "got $img (" . length($body) . ")");
2271
2272  my ($iw, $ih);
2273
2274  # If we are using the webcollage-helper, then we do not need to convert this
2275  # image to a PPM.  But, if we're using a filter command, we still must, since
2276  # that's what the filters expect (webcollage-helper can read PPMs, so that's
2277  # fine.)
2278  #
2279  if (defined ($webcollage_helper) &&
2280      !defined ($filter_cmd)) {
2281
2282    ($iw, $ih) = image_size ($body);
2283    if (!$iw || !$ih) {
2284      LOG (($verbose_pbm || $verbose_load),
2285           "not a GIF or JPG" .
2286           (($body =~ m@<(base|html|head|body|script|table|a href)>@i)
2287            ? " (looks like HTML)" : "") .
2288           ": $img");
2289      $suppress_audit = 1;
2290      $body = undef;
2291      return 0;
2292    }
2293
2294    local *OUT;
2295    open (OUT, ">$image_tmp1") || error ("writing $image_tmp1: $!");
2296    print OUT $body || error ("writing $image_tmp1: $!");
2297    close OUT || error ("writing $image_tmp1: $!");
2298
2299  } else {
2300    ($iw, $ih) = image_to_pnm ($img, $body, $image_tmp1);
2301    $body = undef;
2302    if (!$iw || !$ih) {
2303      LOG ($verbose_pbm, "unable to make PBM from $img");
2304      return 0;
2305    }
2306  }
2307
2308  record_success ($load_method, $img, $base);
2309
2310
2311  my $ow = $iw;  # used only for error messages
2312  my $oh = $ih;
2313
2314  # don't just tack this onto the front of the pipeline -- we want it to
2315  # be able to change the size of the input image.
2316  #
2317  if ($filter_cmd) {
2318    LOG ($verbose_pbm, "running $filter_cmd");
2319
2320    my $rc = nontrapping_system "($filter_cmd) < $image_tmp1 >$image_tmp2";
2321    if ($rc != 0) {
2322      LOG(($verbose_pbm || $verbose_load), "failed command: \"$filter_cmd\"");
2323      LOG(($verbose_pbm || $verbose_load), "failed URL: \"$img\" (${ow}x$oh)");
2324      return;
2325    }
2326    rename ($image_tmp2, $image_tmp1);
2327
2328    # re-get the width/height in case the filter resized it.
2329    local *IMG;
2330    open(IMG, "<$image_tmp1") || return 0;
2331    $_ = <IMG>;
2332    $_ = <IMG>;
2333    ($iw, $ih) = m/^(\d+) (\d+)$/;
2334    close (IMG);
2335    return 0 unless ($iw && $ih);
2336  }
2337
2338  my $target_w = $img_width;
2339  my $target_h = $img_height;
2340
2341  my $cmd = "";
2342  my $scale = 1.0;
2343
2344
2345  # Usually scale the image to fit on the screen -- but sometimes scale it
2346  # to fit on half or a quarter of the screen.  Note that we don't merely
2347  # scale it to fit, we instead cut it in half until it fits -- that should
2348  # give a wider distribution of sizes.
2349  #
2350  if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; $scale /= 2; }
2351  if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; $scale /= 2; }
2352
2353  if ($iw > $target_w || $ih > $target_h) {
2354    while ($iw > $target_w ||
2355           $ih > $target_h) {
2356      $iw = int($iw / 2);
2357      $ih = int($ih / 2);
2358    }
2359    if ($iw <= 10 || $ih <= 10) {
2360      LOG ($verbose_pbm, "scaling to ${iw}x$ih would have been bogus.");
2361      return 0;
2362    }
2363
2364    LOG ($verbose_pbm, "scaling to ${iw}x$ih");
2365
2366    $cmd .= " | pnmscale -xsize $iw -ysize $ih";
2367  }
2368
2369
2370  my $src = $image_tmp1;
2371
2372  my $crop_x = 0;     # the sub-rectangle of the image
2373  my $crop_y = 0;     # that we will actually paste.
2374  my $crop_w = $iw;
2375  my $crop_h = $ih;
2376
2377  # The chance that we will randomly crop out a section of an image starts
2378  # out fairly low, but goes up for images that are very large, or images
2379  # that have ratios that make them look like banners (we try to avoid
2380  # banner images entirely, but they slip through when the IMG tags didn't
2381  # have WIDTH and HEIGHT specified.)
2382  #
2383  my $crop_chance = 0.2;
2384  if ($iw > $img_width * 0.4 || $ih > $img_height * 0.4) {
2385    $crop_chance += 0.2;
2386  }
2387  if ($iw > $img_width * 0.7 || $ih > $img_height * 0.7) {
2388    $crop_chance += 0.2;
2389  }
2390  if ($min_ratio && ($iw * $min_ratio) > $ih) {
2391    $crop_chance += 0.7;
2392  }
2393
2394  if ($crop_chance > 0.1) {
2395    LOG ($verbose_pbm, "crop chance: $crop_chance");
2396  }
2397
2398  if (rand() < $crop_chance) {
2399
2400    my $ow = $crop_w;
2401    my $oh = $crop_h;
2402
2403    if ($crop_w > $min_width) {
2404      # if it's a banner, select the width linearly.
2405      # otherwise, select a bell.
2406      my $r = (($min_ratio && ($iw * $min_ratio) > $ih)
2407               ? rand()
2408               : bellrand());
2409      $crop_w = $min_width + int ($r * ($crop_w - $min_width));
2410      $crop_x = int (rand() * ($ow - $crop_w));
2411    }
2412    if ($crop_h > $min_height) {
2413      # height always selects as a bell.
2414      $crop_h = $min_height + int (bellrand() * ($crop_h - $min_height));
2415      $crop_y = int (rand() * ($oh - $crop_h));
2416    }
2417
2418    if ($crop_x != 0   || $crop_y != 0 ||
2419        $crop_w != $iw || $crop_h != $ih) {
2420      LOG ($verbose_pbm,
2421           "randomly cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y");
2422    }
2423  }
2424
2425  # Where the image should logically land -- this might be negative.
2426  #
2427  my $x = int((rand() * ($img_width  + $crop_w/2)) - $crop_w*3/4);
2428  my $y = int((rand() * ($img_height + $crop_h/2)) - $crop_h*3/4);
2429
2430  # if we have chosen to paste the image outside of the rectangle of the
2431  # screen, then we need to crop it.
2432  #
2433  if ($x < 0 ||
2434      $y < 0 ||
2435      $x + $crop_w > $img_width ||
2436      $y + $crop_h > $img_height) {
2437
2438    LOG ($verbose_pbm,
2439         "cropping for effective paste of ${crop_w}x$crop_h \@ $x,$y");
2440
2441    if ($x < 0) { $crop_x -= $x; $crop_w += $x; $x = 0; }
2442    if ($y < 0) { $crop_y -= $y; $crop_h += $y; $y = 0; }
2443
2444    if ($x + $crop_w >= $img_width)  { $crop_w = $img_width  - $x - 1; }
2445    if ($y + $crop_h >= $img_height) { $crop_h = $img_height - $y - 1; }
2446  }
2447
2448  # If any cropping needs to happen, add pnmcut.
2449  #
2450  if ($crop_x != 0   || $crop_y != 0 ||
2451        $crop_w != $iw || $crop_h != $ih) {
2452    $iw = $crop_w;
2453    $ih = $crop_h;
2454    $cmd .= " | pnmcut $crop_x $crop_y $iw $ih";
2455    LOG ($verbose_pbm, "cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y");
2456  }
2457
2458  LOG ($verbose_pbm, "pasting ${iw}x$ih \@ $x,$y in $image_ppm");
2459
2460  $cmd .= " | pnmpaste - $x $y $image_ppm";
2461
2462  $cmd =~ s@^ *\| *@@;
2463
2464  if (defined ($webcollage_helper)) {
2465    $cmd = "$webcollage_helper $image_tmp1 $image_ppm " .
2466                              "$scale $opacity " .
2467                              "$crop_x $crop_y $x $y " .
2468                              "$iw $ih";
2469    $_ = $cmd;
2470
2471  } else {
2472    # use a PPM pipeline
2473    $_ = "($cmd)";
2474    $_ .= " < $image_tmp1 > $image_tmp2";
2475  }
2476
2477  if ($verbose_pbm) {
2478    $_ = "($_) 2>&1 | sed s'/^/" . blurb() . "/'";
2479  } else {
2480    $_ .= " 2> /dev/null";
2481  }
2482
2483  my $rc = nontrapping_system ($_);
2484
2485  if (defined ($webcollage_helper) && -z $image_ppm) {
2486    LOG (1, "failed command: \"$cmd\"");
2487    print STDERR "\naudit log:\n\n\n";
2488    print STDERR ("#" x 78) . "\n";
2489    print STDERR blurb() . "$image_ppm has zero size\n";
2490    showlog();
2491    print STDERR "\n\n";
2492    exit (1);
2493  }
2494
2495  if ($rc != 0) {
2496    LOG (($verbose_pbm || $verbose_load), "failed command: \"$cmd\"");
2497    LOG (($verbose_pbm || $verbose_load), "failed URL: \"$img\" (${ow}x$oh)");
2498    return;
2499  }
2500
2501  if (!defined ($webcollage_helper)) {
2502    rename ($image_tmp2, $image_ppm) || return;
2503  }
2504
2505  my $target = "$image_ppm";
2506
2507  # don't just tack this onto the end of the pipeline -- we don't want it
2508  # to end up in $image_ppm, because we don't want the results to be
2509  # cumulative.
2510  #
2511  if ($post_filter_cmd) {
2512
2513    my $cmd;
2514
2515    $target = $image_tmp1;
2516    if (!defined ($webcollage_helper)) {
2517      $cmd = "($post_filter_cmd) < $image_ppm > $target";
2518    } else {
2519      # Blah, my scripts need the JPEG data, but some other folks need
2520      # the PPM data -- what to do?  Ignore the problem, that's what!
2521#     $cmd = "djpeg < $image_ppm | ($post_filter_cmd) > $target";
2522      $cmd = "($post_filter_cmd) < $image_ppm > $target";
2523    }
2524
2525    $rc = nontrapping_system ($cmd);
2526    if ($rc != 0) {
2527      LOG ($verbose_pbm, "filter failed: \"$post_filter_cmd\"\n");
2528      return;
2529    }
2530  }
2531
2532  if (!$no_output_p) {
2533    my $tsize = (stat($target))[7];
2534    if ($tsize > 200) {
2535      $cmd = "$ppm_to_root_window_cmd $target";
2536
2537      # xv seems to hate being killed.  it tends to forget to clean
2538      # up after itself, and leaves windows around and colors allocated.
2539      # I had this same problem with vidwhacker, and I'm not entirely
2540      # sure what I did to fix it.  But, let's try this: launch xv
2541      # in the background, so that killing this process doesn't kill it.
2542      # it will die of its own accord soon enough.  So this means we
2543      # start pumping bits to the root window in parallel with starting
2544      # the next network retrieval, which is probably a better thing
2545      # to do anyway.
2546      #
2547      $cmd .= " &";
2548
2549      $rc = nontrapping_system ($cmd);
2550
2551      if ($rc != 0) {
2552        LOG (($verbose_pbm || $verbose_load), "display failed: \"$cmd\"");
2553        return;
2554      }
2555
2556    } else {
2557      LOG ($verbose_pbm, "$target size is $tsize");
2558    }
2559  }
2560
2561  $source .= "-" . stats_of($source);
2562  print STDOUT "image: ${iw}x${ih} @ $x,$y $base $source\n"
2563    if ($verbose_imgmap);
2564
2565  clearlog();
2566
2567  return 1;
2568}
2569
2570
2571sub init_signals {
2572
2573  $SIG{HUP}  = \&signal_cleanup;
2574  $SIG{INT}  = \&signal_cleanup;
2575  $SIG{QUIT} = \&signal_cleanup;
2576  $SIG{ABRT} = \&signal_cleanup;
2577  $SIG{KILL} = \&signal_cleanup;
2578  $SIG{TERM} = \&signal_cleanup;
2579
2580  # Need this so that if giftopnm dies, we don't die.
2581  $SIG{PIPE} = 'IGNORE';
2582}
2583
2584END { signal_cleanup(); }
2585
2586
2587sub main {
2588  $| = 1;
2589  srand(time ^ $$);
2590
2591  my $verbose = 0;
2592  my $dict;
2593  my $driftnet_cmd = 0;
2594
2595  $current_state = "init";
2596  $load_method = "none";
2597
2598  my $root_p = 0;
2599
2600  # historical suckage: the environment variable name is lower case.
2601  $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY};
2602
2603  while ($_ = $ARGV[0]) {
2604    shift @ARGV;
2605    if ($_ eq "-display" ||
2606        $_ eq "-displ" ||
2607        $_ eq "-disp" ||
2608        $_ eq "-dis" ||
2609        $_ eq "-dpy" ||
2610        $_ eq "-d") {
2611      $ENV{DISPLAY} = shift @ARGV;
2612    } elsif ($_ eq "-root") {
2613      $root_p = 1;
2614    } elsif ($_ eq "-no-output") {
2615      $no_output_p = 1;
2616    } elsif ($_ eq "-urls-only") {
2617      $urls_only_p = 1;
2618      $no_output_p = 1;
2619    } elsif ($_ eq "-verbose") {
2620      $verbose++;
2621    } elsif (m/^-v+$/) {
2622      $verbose += length($_)-1;
2623    } elsif ($_ eq "-delay") {
2624      $delay = shift @ARGV;
2625    } elsif ($_ eq "-timeout") {
2626      $http_timeout = shift @ARGV;
2627    } elsif ($_ eq "-filter") {
2628      $filter_cmd = shift @ARGV;
2629    } elsif ($_ eq "-filter2") {
2630      $post_filter_cmd = shift @ARGV;
2631    } elsif ($_ eq "-background" || $_ eq "-bg") {
2632      $background = shift @ARGV;
2633    } elsif ($_ eq "-size") {
2634      $_ = shift @ARGV;
2635      if (m@^(\d+)x(\d+)$@) {
2636        $img_width = $1;
2637        $img_height = $2;
2638      } else {
2639        error "argument to \"-size\" must be of the form \"640x400\"";
2640      }
2641    } elsif ($_ eq "-proxy" || $_ eq "-http-proxy") {
2642      $http_proxy = shift @ARGV;
2643    } elsif ($_ eq "-dictionary" || $_ eq "-dict") {
2644      $dict = shift @ARGV;
2645    } elsif ($_ eq "-driftnet" || $_ eq "--driftnet") {
2646      @search_methods = ( 100, "driftnet", \&pick_from_driftnet );
2647      if (! ($ARGV[0] =~ m/^-/)) {
2648        $driftnet_cmd = shift @ARGV;
2649      } else {
2650        $driftnet_cmd = $default_driftnet_cmd;
2651      }
2652    } elsif ($_ eq "-debug" || $_ eq "--debug") {
2653      my $which = shift @ARGV;
2654      my @rest = @search_methods;
2655      my $ok = 0;
2656      while (@rest) {
2657        my $pct  = shift @rest;
2658        my $name = shift @rest;
2659        my $tfn  = shift @rest;
2660
2661        if ($name eq $which) {
2662          @search_methods = (100, $name, $tfn);
2663          $ok = 1;
2664          last;
2665        }
2666      }
2667      error "no such search method as \"$which\"" unless ($ok);
2668      LOG (1, "DEBUG: using only \"$which\"");
2669
2670    } else {
2671      print STDERR "$copyright\nusage: $progname " .
2672              "[-root] [-display dpy] [-verbose] [-debug which]\n" .
2673        "\t\t  [-timeout secs] [-delay secs] [-filter cmd] [-filter2 cmd]\n" .
2674        "\t\t  [-no-output] [-urls-only] [-background color] [-size WxH]\n" .
2675        "\t\t  [-dictionary dictionary-file] [-http-proxy host[:port]]\n" .
2676        "\t\t  [-driftnet [driftnet-program-and-args]]\n" .
2677        "\n";
2678      exit 1;
2679    }
2680  }
2681
2682  if ($http_proxy && $http_proxy eq "") {
2683    $http_proxy = undef;
2684  }
2685  if ($http_proxy && $http_proxy =~ m@^http://([^/]*)/?$@ ) {
2686    # historical suckage: allow "http://host:port" as well as "host:port".
2687    $http_proxy = $1;
2688  }
2689
2690  if (!$root_p && !$no_output_p) {
2691    print STDERR $copyright;
2692    error "the -root argument is mandatory (for now.)";
2693  }
2694
2695  if (!$no_output_p && !$ENV{DISPLAY}) {
2696    error "\$DISPLAY is not set.";
2697  }
2698
2699
2700  if ($verbose == 1) {
2701    $verbose_imgmap   = 1;
2702    $verbose_warnings = 1;
2703
2704  } elsif ($verbose == 2) {
2705    $verbose_imgmap   = 1;
2706    $verbose_warnings = 1;
2707    $verbose_load     = 1;
2708
2709  } elsif ($verbose == 3) {
2710    $verbose_imgmap   = 1;
2711    $verbose_warnings = 1;
2712    $verbose_load     = 1;
2713    $verbose_filter   = 1;
2714
2715  } elsif ($verbose == 4) {
2716    $verbose_imgmap   = 1;
2717    $verbose_warnings = 1;
2718    $verbose_load     = 1;
2719    $verbose_filter   = 1;
2720    $verbose_net      = 1;
2721
2722  } elsif ($verbose == 5) {
2723    $verbose_imgmap   = 1;
2724    $verbose_warnings = 1;
2725    $verbose_load     = 1;
2726    $verbose_filter   = 1;
2727    $verbose_net      = 1;
2728    $verbose_pbm      = 1;
2729
2730  } elsif ($verbose == 6) {
2731    $verbose_imgmap   = 1;
2732    $verbose_warnings = 1;
2733    $verbose_load     = 1;
2734    $verbose_filter   = 1;
2735    $verbose_net      = 1;
2736    $verbose_pbm      = 1;
2737    $verbose_http     = 1;
2738
2739  } elsif ($verbose >= 7) {
2740    $verbose_imgmap   = 1;
2741    $verbose_warnings = 1;
2742    $verbose_load     = 1;
2743    $verbose_filter   = 1;
2744    $verbose_net      = 1;
2745    $verbose_pbm      = 1;
2746    $verbose_http     = 1;
2747    $verbose_exec     = 1;
2748  }
2749
2750  if ($dict) {
2751    error ("$dict does not exist") unless (-f $dict);
2752    $wordlist = $dict;
2753  } else {
2754    pick_dictionary();
2755  }
2756
2757  init_signals();
2758
2759  spawn_driftnet ($driftnet_cmd) if ($driftnet_cmd);
2760
2761  if ($urls_only_p) {
2762    url_only_output;
2763  } else {
2764    x_or_pbm_output;
2765  }
2766}
2767
2768main;
2769exit (0);
Note: See TracBrowser for help on using the repository browser.