1 | #!/usr/bin/perl -w |
---|
2 | # |
---|
3 | # webcollage, Copyright (c) 1999-2003 by Jamie Zawinski <jwz@jwz.org> |
---|
4 | # This program decorates the screen with random images from the web. |
---|
5 | # One satisfied customer described it as "a nonstop pop culture brainbath." |
---|
6 | # |
---|
7 | # Permission to use, copy, modify, distribute, and sell this software and its |
---|
8 | # documentation for any purpose is hereby granted without fee, provided that |
---|
9 | # the above copyright notice appear in all copies and that both that |
---|
10 | # copyright notice and this permission notice appear in supporting |
---|
11 | # documentation. No representations are made about the suitability of this |
---|
12 | # software for any purpose. It is provided "as is" without express or |
---|
13 | # implied warranty. |
---|
14 | |
---|
15 | |
---|
16 | # To run this as a display mode with xscreensaver, add this to `programs': |
---|
17 | # |
---|
18 | # webcollage -root |
---|
19 | # webcollage -root -filter 'vidwhacker -stdin -stdout' |
---|
20 | |
---|
21 | |
---|
22 | # If you have the "driftnet" program installed, webcollage can display a |
---|
23 | # collage of images sniffed off your local ethernet, instead of pulled out |
---|
24 | # of search engines: in that way, your screensaver can display the images |
---|
25 | # that your co-workers are downloading! |
---|
26 | # |
---|
27 | # Driftnet is available here: http://www.ex-parrot.com/~chris/driftnet/ |
---|
28 | # Use it like so: |
---|
29 | # |
---|
30 | # webcollage -root -driftnet |
---|
31 | # |
---|
32 | # Driftnet is the Unix implementation of the MacOS "EtherPEG" program. |
---|
33 | |
---|
34 | |
---|
35 | require 5; |
---|
36 | use strict; |
---|
37 | |
---|
38 | # We can't "use diagnostics" here, because that library malfunctions if |
---|
39 | # you signal and catch alarms: it says "Uncaught exception from user code" |
---|
40 | # and exits, even though I damned well AM catching it! |
---|
41 | #use diagnostics; |
---|
42 | |
---|
43 | |
---|
44 | use Socket; |
---|
45 | require Time::Local; |
---|
46 | require POSIX; |
---|
47 | use Fcntl ':flock'; # import LOCK_* constants |
---|
48 | use POSIX qw(strftime); |
---|
49 | |
---|
50 | use bytes; # Larry can take Unicode and shove it up his ass sideways. |
---|
51 | # Perl 5.8.0 causes us to start getting incomprehensible |
---|
52 | # errors about UTF-8 all over the place without this. |
---|
53 | |
---|
54 | |
---|
55 | my $progname = $0; $progname =~ s@.*/@@g; |
---|
56 | my $version = q{ $Revision: 1.1.1.2 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; |
---|
57 | my $copyright = "WebCollage $version, Copyright (c) 1999-2002" . |
---|
58 | " Jamie Zawinski <jwz\@jwz.org>\n" . |
---|
59 | " http://www.jwz.org/xscreensaver/\n"; |
---|
60 | |
---|
61 | |
---|
62 | |
---|
63 | my @search_methods = ( 77, "altavista", \&pick_from_alta_vista_random_link, |
---|
64 | 14, "yahoorand", \&pick_from_yahoo_random_link, |
---|
65 | 9, "yahoonews", \&pick_from_yahoo_news_text, |
---|
66 | |
---|
67 | # Alta Vista has a new "random link" URL now. |
---|
68 | # They added it specifically to better support webcollage! |
---|
69 | # That was super cool of them. This is how we used to do |
---|
70 | # it, before: |
---|
71 | # |
---|
72 | # 0, "avimages", \&pick_from_alta_vista_images, |
---|
73 | # 0, "avtext", \&pick_from_alta_vista_text, |
---|
74 | |
---|
75 | # Google asked (nicely) for me to stop searching them. |
---|
76 | # I asked them to add a "random link" url. They said |
---|
77 | # "that would be easy, we'll think about it" and then |
---|
78 | # never wrote back. Booo Google! Booooo! |
---|
79 | # |
---|
80 | # 0, "googlenums", \&pick_from_google_image_numbers, |
---|
81 | # 0, "googleimgs", \&pick_from_google_images, |
---|
82 | |
---|
83 | # I suspect Hotbot is actually the same search engine |
---|
84 | # data as Lycos. |
---|
85 | # |
---|
86 | # 0, "hotbot", \&pick_from_hotbot_text, |
---|
87 | |
---|
88 | # Eh, Lycos sucks anyway. |
---|
89 | # 0, "lycos", \&pick_from_lycos_text, |
---|
90 | ); |
---|
91 | |
---|
92 | # programs we can use to write to the root window (tried in ascending order.) |
---|
93 | # |
---|
94 | my @root_displayers = ( |
---|
95 | "xscreensaver-getimage -root -file", |
---|
96 | "chbg -once -xscreensaver -max_size 100", |
---|
97 | "xv -root -quit -viewonly +noresetroot -quick24 -rmode 5" . |
---|
98 | " -rfg black -rbg black", |
---|
99 | "xli -quiet -onroot -center -border black", |
---|
100 | "xloadimage -quiet -onroot -center -border black", |
---|
101 | |
---|
102 | # this lame program wasn't built with vroot.h: |
---|
103 | # "xsri -scale -keep-aspect -center-horizontal -center-vertical", |
---|
104 | ); |
---|
105 | |
---|
106 | |
---|
107 | # Some sites need cookies to work properly. These are they. |
---|
108 | # |
---|
109 | my %cookies = ( |
---|
110 | "www.altavista.com" => "AV_ALL=1", # request uncensored searches |
---|
111 | "web.altavista.com" => "AV_ALL=1", |
---|
112 | |
---|
113 | # log in as "cipherpunk" |
---|
114 | "www.nytimes.com" => 'NYT-S=18cHMIlJOn2Y1bu5xvEG3Ufuk6E1oJ.' . |
---|
115 | 'FMxWaQV0igaB5Yi/Q/guDnLeoL.pe7i1oakSb' . |
---|
116 | '/VqfdUdb2Uo27Vzt1jmPn3cpYRlTw9', |
---|
117 | ); |
---|
118 | |
---|
119 | |
---|
120 | # If this is set, it's a helper program to use for pasting images together: |
---|
121 | # this is a lot faster and more efficient than using PPM pipelines, which is |
---|
122 | # what we do if this program doesn't exist. (We check for "webcollage-helper" |
---|
123 | # on $PATH at startup, and set this variable appropriately.) |
---|
124 | # |
---|
125 | my $webcollage_helper = undef; |
---|
126 | |
---|
127 | |
---|
128 | # If we have the webcollage-helper program, then it will paste the images |
---|
129 | # together with transparency! 0.0 is invisible, 1.0 is totally opaque. |
---|
130 | # |
---|
131 | my $opacity = 0.85; |
---|
132 | |
---|
133 | |
---|
134 | # Some sites have managed to poison the search engines. These are they. |
---|
135 | # (We auto-detect sites that have poisoned the search engines via excessive |
---|
136 | # keywords or dictionary words, but these are ones that slip through |
---|
137 | # anyway.) |
---|
138 | # |
---|
139 | # This can contain full host names, or 2 or 3 component domains. |
---|
140 | # |
---|
141 | my %poisoners = ( |
---|
142 | "die.net" => 1, # 'l33t h4ck3r d00dz. |
---|
143 | "genforum.genealogy.com" => 1, # Cluttering avtext with human names. |
---|
144 | "rootsweb.com" => 1, # Cluttering avtext with human names. |
---|
145 | "akamai.net" => 1, # Lots of sites have their images on Akamai. |
---|
146 | "akamaitech.net" => 1, # But those are pretty much all banners. |
---|
147 | # Since Akamai is super-expensive, let's |
---|
148 | # go out on a limb and assume that all of |
---|
149 | # their customers are rich-and-boring. |
---|
150 | "bartleby.com" => 1, # Dictionary, cluttering avtext. |
---|
151 | "encyclopedia.com" => 1, # Dictionary, cluttering avtext. |
---|
152 | "onlinedictionary.datasegment.com" => 1, # Dictionary, cluttering avtext. |
---|
153 | "hotlinkpics.com" => 1, # Porn site that has poisoned avimages |
---|
154 | # (I don't see how they did it, though!) |
---|
155 | "alwayshotels.com" => 1, # Poisoned Lycos pretty heavily. |
---|
156 | "nextag.com" => 1, # Poisoned Alta Vista real good. |
---|
157 | ); |
---|
158 | |
---|
159 | |
---|
160 | # When verbosity is turned on, we warn about sites that we seem to be hitting |
---|
161 | # a lot: usually this means some new poisoner has made it into the search |
---|
162 | # engines. But sometimes, the warning is just because that site has a lot |
---|
163 | # of stuff on it. So these are the sites that are immune to the "frequent |
---|
164 | # site" diagnostic message. |
---|
165 | # |
---|
166 | my %warningless_sites = ( |
---|
167 | "home.earthlink.net" => 1, # Lots of home pages here. |
---|
168 | "www.geocities.com" => 1, |
---|
169 | "www.angelfire.com" => 1, |
---|
170 | "members.aol.com" => 1, |
---|
171 | |
---|
172 | "yimg.com" => 1, # This is where dailynews.yahoo.com stores |
---|
173 | "eimg.com" => 1, # its images, so pick_from_yahoo_news_text() |
---|
174 | # hits this every time. |
---|
175 | |
---|
176 | "driftnet" => 1, # builtin... |
---|
177 | ); |
---|
178 | |
---|
179 | |
---|
180 | ############################################################################## |
---|
181 | # |
---|
182 | # Various global flags set by command line parameters, or computed |
---|
183 | # |
---|
184 | ############################################################################## |
---|
185 | |
---|
186 | |
---|
187 | my $current_state = "???"; # for diagnostics |
---|
188 | my $load_method; |
---|
189 | my $last_search; |
---|
190 | my $image_succeeded = -1; |
---|
191 | my $suppress_audit = 0; |
---|
192 | |
---|
193 | my $verbose_imgmap = 0; # print out rectangles and URLs only (stdout) |
---|
194 | my $verbose_warnings = 0; # print out warnings when things go wrong |
---|
195 | my $verbose_load = 0; # diagnostics about loading of URLs |
---|
196 | my $verbose_filter = 0; # diagnostics about page selection/rejection |
---|
197 | my $verbose_net = 0; # diagnostics about network I/O |
---|
198 | my $verbose_pbm = 0; # diagnostics about PBM pipelines |
---|
199 | my $verbose_http = 0; # diagnostics about all HTTP activity |
---|
200 | my $verbose_exec = 0; # diagnostics about executing programs |
---|
201 | |
---|
202 | my $report_performance_interval = 60 * 15; # print some stats every 15 minutes |
---|
203 | |
---|
204 | my $http_proxy = undef; |
---|
205 | my $http_timeout = 30; |
---|
206 | my $cvt_timeout = 10; |
---|
207 | |
---|
208 | my $min_width = 50; |
---|
209 | my $min_height = 50; |
---|
210 | my $min_ratio = 1/5; |
---|
211 | |
---|
212 | my $min_gif_area = (120 * 120); |
---|
213 | |
---|
214 | |
---|
215 | my $no_output_p = 0; |
---|
216 | my $urls_only_p = 0; |
---|
217 | |
---|
218 | my @pids_to_kill = (); # forked pids we should kill when we exit, if any. |
---|
219 | |
---|
220 | my $driftnet_magic = 'driftnet'; |
---|
221 | my $driftnet_dir = undef; |
---|
222 | my $default_driftnet_cmd = "driftnet -a -m 100"; |
---|
223 | |
---|
224 | my $wordlist; |
---|
225 | |
---|
226 | my %rejected_urls; |
---|
227 | my @tripwire_words = ("aberrate", "abode", "amorphous", "antioch", |
---|
228 | "arrhenius", "arteriole", "blanket", "brainchild", |
---|
229 | "burdensome", "carnival", "cherub", "chord", "clever", |
---|
230 | "dedicate", "dilogarithm", "dolan", "dryden", |
---|
231 | "eggplant"); |
---|
232 | |
---|
233 | |
---|
234 | ############################################################################## |
---|
235 | # |
---|
236 | # Retrieving URLs |
---|
237 | # |
---|
238 | ############################################################################## |
---|
239 | |
---|
240 | # returns three values: the HTTP response line; the document headers; |
---|
241 | # and the document body. |
---|
242 | # |
---|
243 | sub get_document_1 { |
---|
244 | my ( $url, $referer, $timeout ) = @_; |
---|
245 | |
---|
246 | if (!defined($timeout)) { $timeout = $http_timeout; } |
---|
247 | if ($timeout > $http_timeout) { $timeout = $http_timeout; } |
---|
248 | |
---|
249 | if ($timeout <= 0) { |
---|
250 | LOG (($verbose_net || $verbose_load), "timed out for $url"); |
---|
251 | return (); |
---|
252 | } |
---|
253 | |
---|
254 | LOG ($verbose_net, "get_document_1 $url " . ($referer ? $referer : "")); |
---|
255 | |
---|
256 | if (! ($url =~ m@^http://@i)) { |
---|
257 | LOG ($verbose_net, "not an HTTP URL: $url"); |
---|
258 | return (); |
---|
259 | } |
---|
260 | |
---|
261 | my ($url_proto, $dummy, $serverstring, $path) = split(/\//, $url, 4); |
---|
262 | $path = "" unless $path; |
---|
263 | |
---|
264 | my ($them,$port) = split(/:/, $serverstring); |
---|
265 | $port = 80 unless $port; |
---|
266 | |
---|
267 | my $them2 = $them; |
---|
268 | my $port2 = $port; |
---|
269 | if ($http_proxy) { |
---|
270 | $serverstring = $http_proxy if $http_proxy; |
---|
271 | $serverstring =~ s@^[a-z]+://@@; |
---|
272 | ($them2,$port2) = split(/:/, $serverstring); |
---|
273 | $port2 = 80 unless $port2; |
---|
274 | } |
---|
275 | |
---|
276 | my ($remote, $iaddr, $paddr, $proto, $line); |
---|
277 | $remote = $them2; |
---|
278 | if ($port2 =~ /\D/) { $port2 = getservbyname($port2, 'tcp') } |
---|
279 | if (!$port2) { |
---|
280 | LOG (($verbose_net || $verbose_load), "unrecognised port in $url"); |
---|
281 | return (); |
---|
282 | } |
---|
283 | $iaddr = inet_aton($remote); |
---|
284 | if (!$iaddr) { |
---|
285 | LOG (($verbose_net || $verbose_load), "host not found: $remote"); |
---|
286 | return (); |
---|
287 | } |
---|
288 | $paddr = sockaddr_in($port2, $iaddr); |
---|
289 | |
---|
290 | |
---|
291 | my $head = ""; |
---|
292 | my $body = ""; |
---|
293 | |
---|
294 | @_ = |
---|
295 | eval { |
---|
296 | local $SIG{ALRM} = sub { |
---|
297 | LOG (($verbose_net || $verbose_load), "timed out ($timeout) for $url"); |
---|
298 | die "alarm\n"; |
---|
299 | }; |
---|
300 | alarm $timeout; |
---|
301 | |
---|
302 | $proto = getprotobyname('tcp'); |
---|
303 | if (!socket(S, PF_INET, SOCK_STREAM, $proto)) { |
---|
304 | LOG (($verbose_net || $verbose_load), "socket: $!"); |
---|
305 | return (); |
---|
306 | } |
---|
307 | if (!connect(S, $paddr)) { |
---|
308 | LOG (($verbose_net || $verbose_load), "connect($serverstring): $!"); |
---|
309 | return (); |
---|
310 | } |
---|
311 | |
---|
312 | select(S); $| = 1; select(STDOUT); |
---|
313 | |
---|
314 | my $cookie = $cookies{$them}; |
---|
315 | |
---|
316 | my $user_agent = "$progname/$version"; |
---|
317 | |
---|
318 | if ($url =~ m@^http://www\.altavista\.com/@ || |
---|
319 | $url =~ m@^http://random\.yahoo\.com/@) { |
---|
320 | # block this, you turkeys. |
---|
321 | $user_agent = "Mozilla/4.76 [en] (X11; U; Linux 2.2.16-22 i686; Nav)"; |
---|
322 | } |
---|
323 | |
---|
324 | my $hdrs = "GET " . ($http_proxy ? $url : "/$path") . " HTTP/1.0\r\n" . |
---|
325 | "Host: $them\r\n" . |
---|
326 | "User-Agent: $user_agent\r\n"; |
---|
327 | if ($referer) { |
---|
328 | $hdrs .= "Referer: $referer\r\n"; |
---|
329 | } |
---|
330 | if ($cookie) { |
---|
331 | my @cc = split(/\r?\n/, $cookie); |
---|
332 | $hdrs .= "Cookie: " . join('; ', @cc) . "\r\n"; |
---|
333 | } |
---|
334 | $hdrs .= "\r\n"; |
---|
335 | |
---|
336 | foreach (split('\r?\n', $hdrs)) { |
---|
337 | LOG ($verbose_http, " ==> $_"); |
---|
338 | } |
---|
339 | print S $hdrs; |
---|
340 | my $http = <S> || ""; |
---|
341 | |
---|
342 | # Kludge: the Yahoo Random Link is now returning as its first |
---|
343 | # line "Status: 301" instead of "HTTP/1.0 301 Found". Fix it... |
---|
344 | # |
---|
345 | $http =~ s@^Status:\s+(\d+)\b@HTTP/1.0 $1@i; |
---|
346 | |
---|
347 | $_ = $http; |
---|
348 | s/[\r\n]+$//s; |
---|
349 | LOG ($verbose_http, " <== $_"); |
---|
350 | |
---|
351 | while (<S>) { |
---|
352 | $head .= $_; |
---|
353 | s/[\r\n]+$//s; |
---|
354 | last if m@^$@; |
---|
355 | LOG ($verbose_http, " <== $_"); |
---|
356 | |
---|
357 | if (m@^Set-cookie:\s*([^;\r\n]+)@i) { |
---|
358 | set_cookie($them, $1) |
---|
359 | } |
---|
360 | } |
---|
361 | |
---|
362 | my $lines = 0; |
---|
363 | while (<S>) { |
---|
364 | $body .= $_; |
---|
365 | $lines++; |
---|
366 | } |
---|
367 | |
---|
368 | LOG ($verbose_http, |
---|
369 | " <== [ body ]: $lines lines, " . length($body) . " bytes"); |
---|
370 | |
---|
371 | close S; |
---|
372 | |
---|
373 | if (!$http) { |
---|
374 | LOG (($verbose_net || $verbose_load), "null response: $url"); |
---|
375 | return (); |
---|
376 | } |
---|
377 | |
---|
378 | return ( $http, $head, $body ); |
---|
379 | }; |
---|
380 | die if ($@ && $@ ne "alarm\n"); # propagate errors |
---|
381 | if ($@) { |
---|
382 | # timed out |
---|
383 | $head = undef; |
---|
384 | $body = undef; |
---|
385 | $suppress_audit = 1; |
---|
386 | return (); |
---|
387 | } else { |
---|
388 | # didn't |
---|
389 | alarm 0; |
---|
390 | return @_; |
---|
391 | } |
---|
392 | } |
---|
393 | |
---|
394 | |
---|
395 | # returns two values: the document headers; and the document body. |
---|
396 | # if the given URL did a redirect, returns the redirected-to document. |
---|
397 | # |
---|
398 | sub get_document { |
---|
399 | my ( $url, $referer, $timeout ) = @_; |
---|
400 | my $start = time; |
---|
401 | |
---|
402 | if (defined($referer) && $referer eq $driftnet_magic) { |
---|
403 | return get_driftnet_file ($url); |
---|
404 | } |
---|
405 | |
---|
406 | my $orig_url = $url; |
---|
407 | my $loop_count = 0; |
---|
408 | my $max_loop_count = 4; |
---|
409 | |
---|
410 | do { |
---|
411 | if (defined($timeout) && $timeout <= 0) { |
---|
412 | LOG (($verbose_net || $verbose_load), "timed out for $url"); |
---|
413 | $suppress_audit = 1; |
---|
414 | return (); |
---|
415 | } |
---|
416 | |
---|
417 | my ( $http, $head, $body ) = get_document_1 ($url, $referer, $timeout); |
---|
418 | |
---|
419 | if (defined ($timeout)) { |
---|
420 | my $now = time; |
---|
421 | my $elapsed = $now - $start; |
---|
422 | $timeout -= $elapsed; |
---|
423 | $start = $now; |
---|
424 | } |
---|
425 | |
---|
426 | return () unless $http; # error message already printed |
---|
427 | |
---|
428 | $http =~ s/[\r\n]+$//s; |
---|
429 | |
---|
430 | if ( $http =~ m@^HTTP/[0-9.]+ 30[123]@ ) { |
---|
431 | $_ = $head; |
---|
432 | |
---|
433 | my ( $location ) = m@^location:[ \t]*(.*)$@im; |
---|
434 | if ( $location ) { |
---|
435 | $location =~ s/[\r\n]$//; |
---|
436 | |
---|
437 | LOG ($verbose_net, "redirect from $url to $location"); |
---|
438 | $referer = $url; |
---|
439 | $url = $location; |
---|
440 | |
---|
441 | if ($url =~ m@^/@) { |
---|
442 | $referer =~ m@^(http://[^/]+)@i; |
---|
443 | $url = $1 . $url; |
---|
444 | } elsif (! ($url =~ m@^[a-z]+:@i)) { |
---|
445 | $_ = $referer; |
---|
446 | s@[^/]+$@@g if m@^http://[^/]+/@i; |
---|
447 | $_ .= "/" if m@^http://[^/]+$@i; |
---|
448 | $url = $_ . $url; |
---|
449 | } |
---|
450 | |
---|
451 | } else { |
---|
452 | LOG ($verbose_net, "no Location with \"$http\""); |
---|
453 | return ( $url, $body ); |
---|
454 | } |
---|
455 | |
---|
456 | if ($loop_count++ > $max_loop_count) { |
---|
457 | LOG ($verbose_net, |
---|
458 | "too many redirects ($max_loop_count) from $orig_url"); |
---|
459 | $body = undef; |
---|
460 | return (); |
---|
461 | } |
---|
462 | |
---|
463 | } elsif ( $http =~ m@^HTTP/[0-9.]+ ([4-9][0-9][0-9].*)$@ ) { |
---|
464 | |
---|
465 | LOG (($verbose_net || $verbose_load), "failed: $1 ($url)"); |
---|
466 | |
---|
467 | # http errors -- return nothing. |
---|
468 | $body = undef; |
---|
469 | return (); |
---|
470 | |
---|
471 | } elsif (!$body) { |
---|
472 | |
---|
473 | LOG (($verbose_net || $verbose_load), "document contains no data: $url"); |
---|
474 | return (); |
---|
475 | |
---|
476 | } else { |
---|
477 | |
---|
478 | # ok! |
---|
479 | return ( $url, $body ); |
---|
480 | } |
---|
481 | |
---|
482 | } while (1); |
---|
483 | } |
---|
484 | |
---|
485 | # If we already have a cookie defined for this site, and the site is trying |
---|
486 | # to overwrite that very same cookie, let it do so. This is because nytimes |
---|
487 | # expires its cookies - it lets you upgrade to a new cookie without logging |
---|
488 | # in again, but you have to present the old cookie to get the new cookie. |
---|
489 | # So, by doing this, the built-in cypherpunks cookie will never go "stale". |
---|
490 | # |
---|
491 | sub set_cookie { |
---|
492 | my ($host, $cookie) = @_; |
---|
493 | my $oc = $cookies{$host}; |
---|
494 | return unless $oc; |
---|
495 | $_ = $oc; |
---|
496 | my ($oc_name, $oc_value) = m@^([^= \t\r\n]+)=(.*)$@; |
---|
497 | $_ = $cookie; |
---|
498 | my ($nc_name, $nc_value) = m@^([^= \t\r\n]+)=(.*)$@; |
---|
499 | |
---|
500 | if ($oc_name eq $nc_name && |
---|
501 | $oc_value ne $nc_value) { |
---|
502 | $cookies{$host} = $cookie; |
---|
503 | LOG ($verbose_net, "overwrote ${host}'s $oc_name cookie"); |
---|
504 | } |
---|
505 | } |
---|
506 | |
---|
507 | |
---|
508 | ############################################################################ |
---|
509 | # |
---|
510 | # Extracting image URLs from HTML |
---|
511 | # |
---|
512 | ############################################################################ |
---|
513 | |
---|
514 | # given a URL and the body text at that URL, selects and returns a random |
---|
515 | # image from it. returns () if no suitable images found. |
---|
516 | # |
---|
517 | sub pick_image_from_body { |
---|
518 | my ( $url, $body ) = @_; |
---|
519 | |
---|
520 | my $base = $url; |
---|
521 | $_ = $url; |
---|
522 | |
---|
523 | # if there's at least one slash after the host, take off the last |
---|
524 | # pathname component |
---|
525 | if ( m@^http://[^/]+/@io ) { |
---|
526 | $base =~ s@[^/]+$@@go; |
---|
527 | } |
---|
528 | |
---|
529 | # if there are no slashes after the host at all, put one on the end. |
---|
530 | if ( m@^http://[^/]+$@io ) { |
---|
531 | $base .= "/"; |
---|
532 | } |
---|
533 | |
---|
534 | $_ = $body; |
---|
535 | |
---|
536 | # strip out newlines, compress whitespace |
---|
537 | s/[\r\n\t ]+/ /go; |
---|
538 | |
---|
539 | # nuke comments |
---|
540 | s/<!--.*?-->//go; |
---|
541 | |
---|
542 | |
---|
543 | # There are certain web sites that list huge numbers of dictionary |
---|
544 | # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise! |
---|
545 | # Porn sites tend not to be reputable!) |
---|
546 | # |
---|
547 | # I do not want webcollage to filter on content: I want it to select |
---|
548 | # randomly from the set of images on the web. All the logic here for |
---|
549 | # rejecting some images is really a set of heuristics for rejecting |
---|
550 | # images that are not really images: for rejecting *text* that is in |
---|
551 | # GIF/JPEG form. I don't want text, I want pictures, and I want the |
---|
552 | # content of the pictures to be randomly selected from among all the |
---|
553 | # available content. |
---|
554 | # |
---|
555 | # So, filtering out "dirty" pictures by looking for "dirty" keywords |
---|
556 | # would be wrong: dirty pictures exist, like it or not, so webcollage |
---|
557 | # should be able to select them. |
---|
558 | # |
---|
559 | # However, picking a random URL is a hard thing to do. The mechanism I'm |
---|
560 | # using is to search for a selection of random words. This is not |
---|
561 | # perfect, but works ok most of the time. The way it breaks down is when |
---|
562 | # some URLs get precedence because their pages list *every word* as |
---|
563 | # related -- those URLs come up more often than others. |
---|
564 | # |
---|
565 | # So, after we've retrieved a URL, if it has too many keywords, reject |
---|
566 | # it. We reject it not on the basis of what those keywords are, but on |
---|
567 | # the basis that by having so many, the page has gotten an unfair |
---|
568 | # advantage against our randomizer. |
---|
569 | # |
---|
570 | my $trip_count = 0; |
---|
571 | foreach my $trip (@tripwire_words) { |
---|
572 | $trip_count++ if m/$trip/i; |
---|
573 | } |
---|
574 | |
---|
575 | if ($trip_count >= $#tripwire_words - 2) { |
---|
576 | LOG (($verbose_filter || $verbose_load), |
---|
577 | "there is probably a dictionary in \"$url\": rejecting."); |
---|
578 | $rejected_urls{$url} = -1; |
---|
579 | $body = undef; |
---|
580 | $_ = undef; |
---|
581 | return (); |
---|
582 | } |
---|
583 | |
---|
584 | |
---|
585 | my @urls; |
---|
586 | my %unique_urls; |
---|
587 | |
---|
588 | foreach (split(/ *</)) { |
---|
589 | if ( m/^meta /i ) { |
---|
590 | |
---|
591 | # Likewise, reject any web pages that have a KEYWORDS meta tag |
---|
592 | # that is too long. |
---|
593 | # |
---|
594 | if (m/name ?= ?\"?keywords\"?/i && |
---|
595 | m/content ?= ?\"([^\"]+)\"/) { |
---|
596 | my $L = length($1); |
---|
597 | if ($L > 1000) { |
---|
598 | LOG (($verbose_filter || $verbose_load), |
---|
599 | "excessive keywords ($L bytes) in $url: rejecting."); |
---|
600 | $rejected_urls{$url} = $L; |
---|
601 | $body = undef; |
---|
602 | $_ = undef; |
---|
603 | return (); |
---|
604 | } else { |
---|
605 | LOG ($verbose_filter, " keywords ($L bytes) in $url (ok)"); |
---|
606 | } |
---|
607 | } |
---|
608 | |
---|
609 | } elsif ( m/^(img|a) .*(src|href) ?= ?\"? ?(.*?)[ >\"]/io ) { |
---|
610 | |
---|
611 | my $was_inline = (! ( "$1" eq "a" || "$1" eq "A" )); |
---|
612 | my $link = $3; |
---|
613 | my ( $width ) = m/width ?=[ \"]*(\d+)/oi; |
---|
614 | my ( $height ) = m/height ?=[ \"]*(\d+)/oi; |
---|
615 | $_ = $link; |
---|
616 | |
---|
617 | if ( m@^/@o ) { |
---|
618 | my $site; |
---|
619 | ( $site = $base ) =~ s@^(http://[^/]*).*@$1@gio; |
---|
620 | $_ = "$site$link"; |
---|
621 | } elsif ( ! m@^[^/:?]+:@ ) { |
---|
622 | $_ = "$base$link"; |
---|
623 | s@/\./@/@g; |
---|
624 | 1 while (s@/[^/]+/\.\./@/@g); |
---|
625 | } |
---|
626 | |
---|
627 | # skip non-http |
---|
628 | if ( ! m@^http://@io ) { |
---|
629 | next; |
---|
630 | } |
---|
631 | |
---|
632 | # skip non-image |
---|
633 | if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg)$@io ) { |
---|
634 | next; |
---|
635 | } |
---|
636 | |
---|
637 | # skip really short or really narrow images |
---|
638 | if ( $width && $width < $min_width) { |
---|
639 | if (!$height) { $height = "?"; } |
---|
640 | LOG ($verbose_filter, " skip narrow image $_ (${width}x$height)"); |
---|
641 | next; |
---|
642 | } |
---|
643 | |
---|
644 | if ( $height && $height < $min_height) { |
---|
645 | if (!$width) { $width = "?"; } |
---|
646 | LOG ($verbose_filter, " skip short image $_ (${width}x$height)"); |
---|
647 | next; |
---|
648 | } |
---|
649 | |
---|
650 | # skip images with ratios that make them look like banners. |
---|
651 | if ($min_ratio && $width && $height && |
---|
652 | ($width * $min_ratio ) > $height) { |
---|
653 | if (!$height) { $height = "?"; } |
---|
654 | LOG ($verbose_filter, " skip bad ratio $_ (${width}x$height)"); |
---|
655 | next; |
---|
656 | } |
---|
657 | |
---|
658 | # skip GIFs with a small number of pixels -- those usually suck. |
---|
659 | if ($width && $height && |
---|
660 | m/\.gif$/io && |
---|
661 | ($width * $height) < $min_gif_area) { |
---|
662 | LOG ($verbose_filter, " skip small GIF $_ (${width}x$height)"); |
---|
663 | next; |
---|
664 | } |
---|
665 | |
---|
666 | |
---|
667 | my $url = $_; |
---|
668 | |
---|
669 | if ($unique_urls{$url}) { |
---|
670 | LOG ($verbose_filter, " skip duplicate image $_"); |
---|
671 | next; |
---|
672 | } |
---|
673 | |
---|
674 | LOG ($verbose_filter, |
---|
675 | " image $url" . |
---|
676 | ($width && $height ? " (${width}x${height})" : "") . |
---|
677 | ($was_inline ? " (inline)" : "")); |
---|
678 | |
---|
679 | $urls[++$#urls] = $url; |
---|
680 | $unique_urls{$url}++; |
---|
681 | |
---|
682 | # jpegs are preferable to gifs. |
---|
683 | $_ = $url; |
---|
684 | if ( ! m@[.]gif$@io ) { |
---|
685 | $urls[++$#urls] = $url; |
---|
686 | } |
---|
687 | |
---|
688 | # pointers to images are preferable to inlined images. |
---|
689 | if ( ! $was_inline ) { |
---|
690 | $urls[++$#urls] = $url; |
---|
691 | $urls[++$#urls] = $url; |
---|
692 | } |
---|
693 | } |
---|
694 | } |
---|
695 | |
---|
696 | my $fsp = ($body =~ m@<frameset@i); |
---|
697 | |
---|
698 | $_ = undef; |
---|
699 | $body = undef; |
---|
700 | |
---|
701 | @urls = depoison (@urls); |
---|
702 | |
---|
703 | if ( $#urls < 0 ) { |
---|
704 | LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : "")); |
---|
705 | return (); |
---|
706 | } |
---|
707 | |
---|
708 | # pick a random element of the table |
---|
709 | my $i = int(rand($#urls+1)); |
---|
710 | $url = $urls[$i]; |
---|
711 | |
---|
712 | LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url"); |
---|
713 | |
---|
714 | return $url; |
---|
715 | } |
---|
716 | |
---|
717 | |
---|
718 | |
---|
719 | ############################################################################ |
---|
720 | # |
---|
721 | # Subroutines for getting pages and images out of search engines |
---|
722 | # |
---|
723 | ############################################################################ |
---|
724 | |
---|
725 | |
---|
726 | sub pick_dictionary { |
---|
727 | my @dicts = ("/usr/dict/words", |
---|
728 | "/usr/share/dict/words", |
---|
729 | "/usr/share/lib/dict/words"); |
---|
730 | foreach my $f (@dicts) { |
---|
731 | if (-f $f) { |
---|
732 | $wordlist = $f; |
---|
733 | last; |
---|
734 | } |
---|
735 | } |
---|
736 | error ("$dicts[0] does not exist") unless defined($wordlist); |
---|
737 | } |
---|
738 | |
---|
739 | # returns a random word from the dictionary |
---|
740 | # |
---|
741 | sub random_word { |
---|
742 | my $word = 0; |
---|
743 | if (open (IN, "<$wordlist")) { |
---|
744 | my $size = (stat(IN))[7]; |
---|
745 | my $pos = rand $size; |
---|
746 | if (seek (IN, $pos, 0)) { |
---|
747 | $word = <IN>; # toss partial line |
---|
748 | $word = <IN>; # keep next line |
---|
749 | } |
---|
750 | if (!$word) { |
---|
751 | seek( IN, 0, 0 ); |
---|
752 | $word = <IN>; |
---|
753 | } |
---|
754 | close (IN); |
---|
755 | } |
---|
756 | |
---|
757 | return 0 if (!$word); |
---|
758 | |
---|
759 | $word =~ s/^[ \t\n\r]+//; |
---|
760 | $word =~ s/[ \t\n\r]+$//; |
---|
761 | $word =~ s/ys$/y/; |
---|
762 | $word =~ s/ally$//; |
---|
763 | $word =~ s/ly$//; |
---|
764 | $word =~ s/ies$/y/; |
---|
765 | $word =~ s/ally$/al/; |
---|
766 | $word =~ s/izes$/ize/; |
---|
767 | $word =~ tr/A-Z/a-z/; |
---|
768 | |
---|
769 | if ( $word =~ s/[ \t\n\r]/\+/g ) { # convert intra-word spaces to "+". |
---|
770 | $word = "\%22$word\%22"; # And put quotes (%22) around it. |
---|
771 | } |
---|
772 | |
---|
773 | return $word; |
---|
774 | } |
---|
775 | |
---|
776 | sub random_words { |
---|
777 | my ($or_p) = @_; |
---|
778 | my $sep = ($or_p ? "%20OR%20" : "%20"); |
---|
779 | return (random_word . $sep . |
---|
780 | random_word . $sep . |
---|
781 | random_word . $sep . |
---|
782 | random_word . $sep . |
---|
783 | random_word); |
---|
784 | } |
---|
785 | |
---|
786 | |
---|
787 | sub url_quote { |
---|
788 | my ($s) = @_; |
---|
789 | $s =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge; |
---|
790 | return $s; |
---|
791 | } |
---|
792 | |
---|
793 | sub url_unquote { |
---|
794 | my ($s) = @_; |
---|
795 | $s =~ s/[+]/ /g; |
---|
796 | $s =~ s/%([a-z0-9]{2})/chr(hex($1))/ige; |
---|
797 | return $s; |
---|
798 | } |
---|
799 | |
---|
800 | |
---|
801 | # Loads the given URL (a search on some search engine) and returns: |
---|
802 | # - the total number of hits the search engine claimed it had; |
---|
803 | # - a list of URLs from the page that the search engine returned; |
---|
804 | # Note that this list contains all kinds of internal search engine |
---|
805 | # junk URLs too -- caller must prune them. |
---|
806 | # |
---|
807 | sub pick_from_search_engine { |
---|
808 | my ( $timeout, $search_url, $words ) = @_; |
---|
809 | |
---|
810 | $_ = $words; |
---|
811 | s/%20/ /g; |
---|
812 | |
---|
813 | print STDERR "\n\n" if ($verbose_load); |
---|
814 | |
---|
815 | LOG ($verbose_load, "words: $_"); |
---|
816 | LOG ($verbose_load, "URL: $search_url"); |
---|
817 | |
---|
818 | $last_search = $search_url; # for warnings |
---|
819 | |
---|
820 | my $start = time; |
---|
821 | my ( $base, $body ) = get_document ($search_url, undef, $timeout); |
---|
822 | if (defined ($timeout)) { |
---|
823 | $timeout -= (time - $start); |
---|
824 | if ($timeout <= 0) { |
---|
825 | $body = undef; |
---|
826 | LOG (($verbose_net || $verbose_load), |
---|
827 | "timed out (late) for $search_url"); |
---|
828 | $suppress_audit = 1; |
---|
829 | return (); |
---|
830 | } |
---|
831 | } |
---|
832 | |
---|
833 | return () if (! $body); |
---|
834 | |
---|
835 | |
---|
836 | my @subpages; |
---|
837 | |
---|
838 | my $search_count = "?"; |
---|
839 | if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) { |
---|
840 | $search_count = $3; |
---|
841 | } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*) @i) { |
---|
842 | $search_count = $1; |
---|
843 | } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) { |
---|
844 | $search_count = $1; |
---|
845 | } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) { |
---|
846 | $search_count = $1; |
---|
847 | } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages |
---|
848 | $search_count = $1; |
---|
849 | } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages |
---|
850 | $search_count = $1; |
---|
851 | } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista |
---|
852 | $search_count = $1; |
---|
853 | } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages |
---|
854 | $search_count = $1; |
---|
855 | } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) { |
---|
856 | $search_count = $1; # lycos |
---|
857 | } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) { |
---|
858 | $search_count = $1; # hotbot |
---|
859 | } elsif ($body =~ m@no photos were found containing@i) { # avimages |
---|
860 | $search_count = "0"; |
---|
861 | } elsif ($body =~ m@found no document matching@i) { # avtext |
---|
862 | $search_count = "0"; |
---|
863 | } |
---|
864 | 1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/); |
---|
865 | |
---|
866 | # if ($search_count eq "?" || $search_count eq "0") { |
---|
867 | # local *OUT; |
---|
868 | # my $file = "/tmp/wc.html"; |
---|
869 | # open(OUT, ">$file") || error ("writing $file: $!"); |
---|
870 | # print OUT $body; |
---|
871 | # close OUT; |
---|
872 | # print STDERR blurb() . "###### wrote $file\n"; |
---|
873 | # } |
---|
874 | |
---|
875 | |
---|
876 | my $length = length($body); |
---|
877 | my $href_count = 0; |
---|
878 | |
---|
879 | $_ = $body; |
---|
880 | |
---|
881 | s/[\r\n\t ]+/ /g; |
---|
882 | |
---|
883 | |
---|
884 | s/(<A )/\n$1/gi; |
---|
885 | foreach (split(/\n/)) { |
---|
886 | $href_count++; |
---|
887 | my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i; |
---|
888 | next unless $u; |
---|
889 | |
---|
890 | if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; } # quoted string |
---|
891 | elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; } # or token |
---|
892 | |
---|
893 | if ( $rejected_urls{$u} ) { |
---|
894 | LOG ($verbose_filter, " pre-rejecting candidate: $u"); |
---|
895 | next; |
---|
896 | } |
---|
897 | |
---|
898 | LOG ($verbose_http, " HREF: $u"); |
---|
899 | |
---|
900 | $subpages[++$#subpages] = $u; |
---|
901 | } |
---|
902 | |
---|
903 | if ( $#subpages < 0 ) { |
---|
904 | LOG ($verbose_filter, |
---|
905 | "found nothing on $base ($length bytes, $href_count links)."); |
---|
906 | return (); |
---|
907 | } |
---|
908 | |
---|
909 | LOG ($verbose_filter, "" . $#subpages+1 . " links on $search_url"); |
---|
910 | |
---|
911 | return ($search_count, @subpages); |
---|
912 | } |
---|
913 | |
---|
914 | |
---|
915 | sub depoison { |
---|
916 | my (@urls) = @_; |
---|
917 | my @urls2 = (); |
---|
918 | foreach (@urls) { |
---|
919 | my ($h) = m@^http://([^/: \t\r\n]+)@i; |
---|
920 | |
---|
921 | next unless defined($h); |
---|
922 | |
---|
923 | if ($poisoners{$h}) { |
---|
924 | LOG (($verbose_filter), " rejecting poisoner: $_"); |
---|
925 | next; |
---|
926 | } |
---|
927 | if ($h =~ m@([^.]+\.[^.]+\.[^.]+)$@ && |
---|
928 | $poisoners{$1}) { |
---|
929 | LOG (($verbose_filter), " rejecting poisoner: $_"); |
---|
930 | next; |
---|
931 | } |
---|
932 | if ($h =~ m@([^.]+\.[^.]+)$@ && |
---|
933 | $poisoners{$1}) { |
---|
934 | LOG (($verbose_filter), " rejecting poisoner: $_"); |
---|
935 | next; |
---|
936 | } |
---|
937 | |
---|
938 | push @urls2, $_; |
---|
939 | } |
---|
940 | return @urls2; |
---|
941 | } |
---|
942 | |
---|
943 | |
---|
944 | # given a list of URLs, picks one at random; loads it; and returns a |
---|
945 | # random image from it. |
---|
946 | # returns the url of the page loaded; the url of the image chosen; |
---|
947 | # and a debugging description string. |
---|
948 | # |
---|
949 | sub pick_image_from_pages { |
---|
950 | my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_; |
---|
951 | |
---|
952 | $total_hit_count = "?" unless defined($total_hit_count); |
---|
953 | |
---|
954 | @pages = depoison (@pages); |
---|
955 | LOG ($verbose_load, |
---|
956 | "" . ($#pages+1) . " candidates of $unfiltered_link_count links" . |
---|
957 | " ($total_hit_count total)"); |
---|
958 | |
---|
959 | return () if ($#pages < 0); |
---|
960 | |
---|
961 | my $i = int(rand($#pages+1)); |
---|
962 | my $page = $pages[$i]; |
---|
963 | |
---|
964 | LOG ($verbose_load, "picked page $page"); |
---|
965 | |
---|
966 | $suppress_audit = 1; |
---|
967 | |
---|
968 | my ( $base2, $body2 ) = get_document ($page, $base, $timeout); |
---|
969 | |
---|
970 | if (!$base2 || !$body2) { |
---|
971 | $body2 = undef; |
---|
972 | return (); |
---|
973 | } |
---|
974 | |
---|
975 | my $img = pick_image_from_body ($base2, $body2); |
---|
976 | $body2 = undef; |
---|
977 | |
---|
978 | if ($img) { |
---|
979 | return ($base2, $img); |
---|
980 | } else { |
---|
981 | return (); |
---|
982 | } |
---|
983 | } |
---|
984 | |
---|
985 | |
---|
986 | ############################################################################ |
---|
987 | # |
---|
988 | # Pick images from random pages returned by the Yahoo Random Link |
---|
989 | # |
---|
990 | ############################################################################ |
---|
991 | |
---|
992 | # yahoorand |
---|
993 | my $yahoo_random_link = "http://random.yahoo.com/fast/ryl"; |
---|
994 | |
---|
995 | |
---|
996 | # Picks a random page; picks a random image on that page; |
---|
997 | # returns two URLs: the page containing the image, and the image. |
---|
998 | # Returns () if nothing found this time. |
---|
999 | # |
---|
1000 | sub pick_from_yahoo_random_link { |
---|
1001 | my ( $timeout ) = @_; |
---|
1002 | |
---|
1003 | print STDERR "\n\n" if ($verbose_load); |
---|
1004 | LOG ($verbose_load, "URL: $yahoo_random_link"); |
---|
1005 | |
---|
1006 | $last_search = $yahoo_random_link; # for warnings |
---|
1007 | |
---|
1008 | $suppress_audit = 1; |
---|
1009 | |
---|
1010 | my ( $base, $body ) = get_document ($yahoo_random_link, undef, $timeout); |
---|
1011 | if (!$base || !$body) { |
---|
1012 | $body = undef; |
---|
1013 | return; |
---|
1014 | } |
---|
1015 | |
---|
1016 | LOG ($verbose_load, "redirected to: $base"); |
---|
1017 | |
---|
1018 | my $img = pick_image_from_body ($base, $body); |
---|
1019 | $body = undef; |
---|
1020 | |
---|
1021 | if ($img) { |
---|
1022 | return ($base, $img); |
---|
1023 | } else { |
---|
1024 | return (); |
---|
1025 | } |
---|
1026 | } |
---|
1027 | |
---|
1028 | |
---|
1029 | ############################################################################ |
---|
1030 | # |
---|
1031 | # Pick images from random pages returned by the Alta Vista Random Link |
---|
1032 | # |
---|
1033 | ############################################################################ |
---|
1034 | |
---|
1035 | # altavista |
---|
1036 | my $alta_vista_random_link = "http://www.altavista.com/image/randomlink"; |
---|
1037 | |
---|
1038 | |
---|
1039 | # Picks a random page; picks a random image on that page; |
---|
1040 | # returns two URLs: the page containing the image, and the image. |
---|
1041 | # Returns () if nothing found this time. |
---|
1042 | # |
---|
1043 | sub pick_from_alta_vista_random_link { |
---|
1044 | my ( $timeout ) = @_; |
---|
1045 | |
---|
1046 | print STDERR "\n\n" if ($verbose_load); |
---|
1047 | LOG ($verbose_load, "URL: $alta_vista_random_link"); |
---|
1048 | |
---|
1049 | $last_search = $alta_vista_random_link; # for warnings |
---|
1050 | |
---|
1051 | $suppress_audit = 1; |
---|
1052 | |
---|
1053 | my ( $base, $body ) = get_document ($alta_vista_random_link, |
---|
1054 | undef, $timeout); |
---|
1055 | if (!$base || !$body) { |
---|
1056 | $body = undef; |
---|
1057 | return; |
---|
1058 | } |
---|
1059 | |
---|
1060 | LOG ($verbose_load, "redirected to: $base"); |
---|
1061 | |
---|
1062 | my $img = pick_image_from_body ($base, $body); |
---|
1063 | $body = undef; |
---|
1064 | |
---|
1065 | if ($img) { |
---|
1066 | return ($base, $img); |
---|
1067 | } else { |
---|
1068 | return (); |
---|
1069 | } |
---|
1070 | } |
---|
1071 | |
---|
1072 | |
---|
1073 | ############################################################################ |
---|
1074 | # |
---|
1075 | # Pick images by feeding random words into Alta Vista Image Search |
---|
1076 | # |
---|
1077 | ############################################################################ |
---|
1078 | |
---|
1079 | |
---|
1080 | my $alta_vista_images_url = "http://www.altavista.com/image/results" . |
---|
1081 | "?ipht=1" . # photos |
---|
1082 | "&igrph=1" . # graphics |
---|
1083 | "&iclr=1" . # color |
---|
1084 | "&ibw=1" . # b&w |
---|
1085 | "&micat=1" . # no partner sites |
---|
1086 | "&sc=on" . # "site collapse" |
---|
1087 | "&q="; |
---|
1088 | |
---|
1089 | # avimages |
---|
1090 | sub pick_from_alta_vista_images { |
---|
1091 | my ( $timeout ) = @_; |
---|
1092 | |
---|
1093 | my $words = random_words(0); |
---|
1094 | my $page = (int(rand(9)) + 1); |
---|
1095 | my $search_url = $alta_vista_images_url . $words; |
---|
1096 | |
---|
1097 | if ($page > 1) { |
---|
1098 | $search_url .= "&pgno=" . $page; # page number |
---|
1099 | $search_url .= "&stq=" . (($page-1) * 12); # first hit result on page |
---|
1100 | } |
---|
1101 | |
---|
1102 | my ($search_hit_count, @subpages) = |
---|
1103 | pick_from_search_engine ($timeout, $search_url, $words); |
---|
1104 | |
---|
1105 | my @candidates = (); |
---|
1106 | foreach my $u (@subpages) { |
---|
1107 | |
---|
1108 | # avtext is encoding their URLs now. |
---|
1109 | next unless ($u =~ m@^/r.*\&r=([^&]+).*@); |
---|
1110 | $u = url_unquote($1); |
---|
1111 | |
---|
1112 | next unless ($u =~ m@^http://@i); # skip non-HTTP or relative URLs |
---|
1113 | next if ($u =~ m@[/.]altavista\.com\b@i); # skip altavista builtins |
---|
1114 | next if ($u =~ m@[/.]doubleclick\.net\b@i); # you cretins |
---|
1115 | next if ($u =~ m@[/.]clicktomarket\.com\b@i); # more cretins |
---|
1116 | |
---|
1117 | next if ($u =~ m@[/.]viewimages\.com\b@i); # stacked deck |
---|
1118 | next if ($u =~ m@[/.]gettyimages\.com\b@i); |
---|
1119 | |
---|
1120 | LOG ($verbose_filter, " candidate: $u"); |
---|
1121 | push @candidates, $u; |
---|
1122 | } |
---|
1123 | |
---|
1124 | return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, |
---|
1125 | $timeout, @candidates); |
---|
1126 | } |
---|
1127 | |
---|
1128 | |
---|
1129 | |
---|
1130 | ############################################################################ |
---|
1131 | # |
---|
1132 | # Pick images by feeding random words into Google Image Search. |
---|
1133 | # By Charles Gales <gales@us.ibm.com> |
---|
1134 | # |
---|
1135 | ############################################################################ |
---|
1136 | |
---|
1137 | |
---|
1138 | my $google_images_url = "http://images.google.com/images" . |
---|
1139 | "?site=images" . # photos |
---|
1140 | "&btnG=Search" . # graphics |
---|
1141 | "&safe=off" . # no screening |
---|
1142 | "&imgsafe=off" . |
---|
1143 | "&q="; |
---|
1144 | |
---|
1145 | # googleimgs |
---|
1146 | sub pick_from_google_images { |
---|
1147 | my ( $timeout ) = @_; |
---|
1148 | |
---|
1149 | my $words = random_word; # only one word for Google |
---|
1150 | my $page = (int(rand(9)) + 1); |
---|
1151 | my $num = 20; # 20 images per page |
---|
1152 | my $search_url = $google_images_url . $words; |
---|
1153 | |
---|
1154 | if ($page > 1) { |
---|
1155 | $search_url .= "&start=" . $page*$num; # page number |
---|
1156 | $search_url .= "&num=" . $num; #images per page |
---|
1157 | } |
---|
1158 | |
---|
1159 | my ($search_hit_count, @subpages) = |
---|
1160 | pick_from_search_engine ($timeout, $search_url, $words); |
---|
1161 | |
---|
1162 | my @candidates = (); |
---|
1163 | foreach my $u (@subpages) { |
---|
1164 | next unless ($u =~ m@imgres\?imgurl@i); # All pics start with this |
---|
1165 | next if ($u =~ m@[/.]google\.com\b@i); # skip google builtins |
---|
1166 | |
---|
1167 | if ($u =~ m@^/imgres\?imgurl=(.*?)\&imgrefurl=(.*?)\&@) { |
---|
1168 | my $urlf = $2; |
---|
1169 | LOG ($verbose_filter, " candidate: $urlf"); |
---|
1170 | push @candidates, $urlf; |
---|
1171 | } |
---|
1172 | } |
---|
1173 | |
---|
1174 | return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, |
---|
1175 | $timeout, @candidates); |
---|
1176 | } |
---|
1177 | |
---|
1178 | |
---|
1179 | |
---|
1180 | ############################################################################ |
---|
1181 | # |
---|
1182 | # Pick images by feeding random *numbers* into Google Image Search. |
---|
1183 | # By jwz, suggested by from Ian O'Donnell. |
---|
1184 | # |
---|
1185 | ############################################################################ |
---|
1186 | |
---|
1187 | |
---|
1188 | # googlenums |
---|
1189 | sub pick_from_google_image_numbers { |
---|
1190 | my ( $timeout ) = @_; |
---|
1191 | |
---|
1192 | my $max = 9999; |
---|
1193 | my $number = int(rand($max)); |
---|
1194 | |
---|
1195 | $number = sprintf("%04d", $number) |
---|
1196 | if (rand() < 0.3); |
---|
1197 | |
---|
1198 | my $words = "$number"; |
---|
1199 | my $page = (int(rand(40)) + 1); |
---|
1200 | my $num = 20; # 20 images per page |
---|
1201 | my $search_url = $google_images_url . $words; |
---|
1202 | |
---|
1203 | if ($page > 1) { |
---|
1204 | $search_url .= "&start=" . $page*$num; # page number |
---|
1205 | $search_url .= "&num=" . $num; #images per page |
---|
1206 | } |
---|
1207 | |
---|
1208 | my ($search_hit_count, @subpages) = |
---|
1209 | pick_from_search_engine ($timeout, $search_url, $words); |
---|
1210 | |
---|
1211 | my @candidates = (); |
---|
1212 | my %referers; |
---|
1213 | foreach my $u (@subpages) { |
---|
1214 | next unless ($u =~ m@imgres\?imgurl@i); # All pics start with this |
---|
1215 | next if ($u =~ m@[/.]google\.com\b@i); # skip google builtins |
---|
1216 | |
---|
1217 | if ($u =~ m@^/imgres\?imgurl=(.*?)\&imgrefurl=(.*?)\&@) { |
---|
1218 | my $ref = $2; |
---|
1219 | my $img = "http://$1"; |
---|
1220 | |
---|
1221 | LOG ($verbose_filter, " candidate: $ref"); |
---|
1222 | push @candidates, $img; |
---|
1223 | $referers{$img} = $ref; |
---|
1224 | } |
---|
1225 | } |
---|
1226 | |
---|
1227 | @candidates = depoison (@candidates); |
---|
1228 | return () if ($#candidates < 0); |
---|
1229 | my $i = int(rand($#candidates+1)); |
---|
1230 | my $img = $candidates[$i]; |
---|
1231 | my $ref = $referers{$img}; |
---|
1232 | |
---|
1233 | LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)"); |
---|
1234 | return ($ref, $img); |
---|
1235 | } |
---|
1236 | |
---|
1237 | |
---|
1238 | |
---|
1239 | ############################################################################ |
---|
1240 | # |
---|
1241 | # Pick images by feeding random words into Alta Vista Text Search |
---|
1242 | # |
---|
1243 | ############################################################################ |
---|
1244 | |
---|
1245 | |
---|
1246 | my $alta_vista_url = "http://www.altavista.com/web/results" . |
---|
1247 | "?pg=aq" . |
---|
1248 | "&aqmode=s" . |
---|
1249 | "&filetype=html" . |
---|
1250 | "&sc=on" . # "site collapse" |
---|
1251 | "&nbq=50" . |
---|
1252 | "&aqo="; |
---|
1253 | |
---|
1254 | # avtext |
---|
1255 | sub pick_from_alta_vista_text { |
---|
1256 | my ( $timeout ) = @_; |
---|
1257 | |
---|
1258 | my $words = random_words(0); |
---|
1259 | my $page = (int(rand(9)) + 1); |
---|
1260 | my $search_url = $alta_vista_url . $words; |
---|
1261 | |
---|
1262 | if ($page > 1) { |
---|
1263 | $search_url .= "&pgno=" . $page; |
---|
1264 | $search_url .= "&stq=" . (($page-1) * 10); |
---|
1265 | } |
---|
1266 | |
---|
1267 | my ($search_hit_count, @subpages) = |
---|
1268 | pick_from_search_engine ($timeout, $search_url, $words); |
---|
1269 | |
---|
1270 | my @candidates = (); |
---|
1271 | foreach my $u (@subpages) { |
---|
1272 | |
---|
1273 | # Those altavista fuckers are playing really nasty redirection games |
---|
1274 | # these days: the filter your clicks through their site, but use |
---|
1275 | # onMouseOver to make it look like they're not! Well, it makes it |
---|
1276 | # easier for us to identify search results... |
---|
1277 | # |
---|
1278 | next unless ($u =~ m@^/r.*\&r=([^&]+).*@); |
---|
1279 | $u = url_unquote($1); |
---|
1280 | |
---|
1281 | LOG ($verbose_filter, " candidate: $u"); |
---|
1282 | push @candidates, $u; |
---|
1283 | } |
---|
1284 | |
---|
1285 | return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, |
---|
1286 | $timeout, @candidates); |
---|
1287 | } |
---|
1288 | |
---|
1289 | |
---|
1290 | |
---|
1291 | ############################################################################ |
---|
1292 | # |
---|
1293 | # Pick images by feeding random words into Hotbot |
---|
1294 | # |
---|
1295 | ############################################################################ |
---|
1296 | |
---|
1297 | my $hotbot_search_url =("http://hotbot.lycos.com/default.asp" . |
---|
1298 | "?ca=w" . |
---|
1299 | "&descriptiontype=0" . |
---|
1300 | "&imagetoggle=1" . |
---|
1301 | "&matchmode=any" . |
---|
1302 | "&nummod=2" . |
---|
1303 | "&recordcount=50" . |
---|
1304 | "&sitegroup=1" . |
---|
1305 | "&stem=1" . |
---|
1306 | "&cobrand=undefined" . |
---|
1307 | "&query="); |
---|
1308 | |
---|
1309 | sub pick_from_hotbot_text { |
---|
1310 | my ( $timeout ) = @_; |
---|
1311 | |
---|
1312 | # lycos seems to always give us back dictionaries and word lists if |
---|
1313 | # we search for more than one word... |
---|
1314 | # |
---|
1315 | my $words = random_word(); |
---|
1316 | |
---|
1317 | my $start = int(rand(8)) * 10 + 1; |
---|
1318 | my $search_url = $hotbot_search_url . $words . "&first=$start&page=more"; |
---|
1319 | |
---|
1320 | my ($search_hit_count, @subpages) = |
---|
1321 | pick_from_search_engine ($timeout, $search_url, $words); |
---|
1322 | |
---|
1323 | my @candidates = (); |
---|
1324 | foreach my $u (@subpages) { |
---|
1325 | |
---|
1326 | # Hotbot plays redirection games too |
---|
1327 | next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@); |
---|
1328 | $u = url_decode($1); |
---|
1329 | |
---|
1330 | LOG ($verbose_filter, " candidate: $u"); |
---|
1331 | push @candidates, $u; |
---|
1332 | } |
---|
1333 | |
---|
1334 | return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, |
---|
1335 | $timeout, @candidates); |
---|
1336 | } |
---|
1337 | |
---|
1338 | |
---|
1339 | |
---|
1340 | ############################################################################ |
---|
1341 | # |
---|
1342 | # Pick images by feeding random words into Lycos |
---|
1343 | # |
---|
1344 | ############################################################################ |
---|
1345 | |
---|
1346 | my $lycos_search_url = "http://search.lycos.com/default.asp" . |
---|
1347 | "?lpv=1" . |
---|
1348 | "&loc=searchhp" . |
---|
1349 | "&tab=web" . |
---|
1350 | "&query="; |
---|
1351 | |
---|
1352 | sub pick_from_lycos_text { |
---|
1353 | my ( $timeout ) = @_; |
---|
1354 | |
---|
1355 | # lycos seems to always give us back dictionaries and word lists if |
---|
1356 | # we search for more than one word... |
---|
1357 | # |
---|
1358 | my $words = random_word(); |
---|
1359 | |
---|
1360 | my $start = int(rand(8)) * 10 + 1; |
---|
1361 | my $search_url = $lycos_search_url . $words . "&first=$start&page=more"; |
---|
1362 | |
---|
1363 | my ($search_hit_count, @subpages) = |
---|
1364 | pick_from_search_engine ($timeout, $search_url, $words); |
---|
1365 | |
---|
1366 | my @candidates = (); |
---|
1367 | foreach my $u (@subpages) { |
---|
1368 | |
---|
1369 | # Lycos plays redirection games. |
---|
1370 | next unless ($u =~ m@^http://click.lycos.com/director.asp |
---|
1371 | .* |
---|
1372 | \btarget=([^&]+) |
---|
1373 | .* |
---|
1374 | @x); |
---|
1375 | $u = url_decode($1); |
---|
1376 | |
---|
1377 | LOG ($verbose_filter, " candidate: $u"); |
---|
1378 | push @candidates, $u; |
---|
1379 | } |
---|
1380 | |
---|
1381 | return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, |
---|
1382 | $timeout, @candidates); |
---|
1383 | } |
---|
1384 | |
---|
1385 | |
---|
1386 | |
---|
1387 | ############################################################################ |
---|
1388 | # |
---|
1389 | # Pick images by feeding random words into news.yahoo.com |
---|
1390 | # |
---|
1391 | ############################################################################ |
---|
1392 | |
---|
1393 | my $yahoo_news_url = "http://search.news.yahoo.com/search/news" . |
---|
1394 | "?a=1" . |
---|
1395 | "&c=news_photos" . |
---|
1396 | "&s=-%24s%2C-date" . |
---|
1397 | "&n=100" . |
---|
1398 | "&o=o" . |
---|
1399 | "&2=" . |
---|
1400 | "&3=" . |
---|
1401 | "&p="; |
---|
1402 | |
---|
1403 | # yahoonews |
---|
1404 | sub pick_from_yahoo_news_text { |
---|
1405 | my ( $timeout ) = @_; |
---|
1406 | |
---|
1407 | my $words = random_words(0); |
---|
1408 | my $search_url = $yahoo_news_url . $words; |
---|
1409 | |
---|
1410 | my ($search_hit_count, @subpages) = |
---|
1411 | pick_from_search_engine ($timeout, $search_url, $words); |
---|
1412 | |
---|
1413 | my @candidates = (); |
---|
1414 | foreach my $u (@subpages) { |
---|
1415 | # only accept URLs on Yahoo's news site |
---|
1416 | next unless ($u =~ m@^http://dailynews\.yahoo\.com/@i || |
---|
1417 | $u =~ m@^http://story\.news\.yahoo\.com/@i); |
---|
1418 | |
---|
1419 | LOG ($verbose_filter, " candidate: $u"); |
---|
1420 | push @candidates, $u; |
---|
1421 | } |
---|
1422 | |
---|
1423 | return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, |
---|
1424 | $timeout, @candidates); |
---|
1425 | } |
---|
1426 | |
---|
1427 | |
---|
1428 | |
---|
1429 | |
---|
1430 | ############################################################################ |
---|
1431 | # |
---|
1432 | # Pick images by waiting for driftnet to populate a temp dir with files. |
---|
1433 | # Requires driftnet version 0.1.5 or later. |
---|
1434 | # (Driftnet is a program by Chris Lightfoot that sniffs your local ethernet |
---|
1435 | # for images being downloaded by others.) |
---|
1436 | # Driftnet/webcollage integration by jwz. |
---|
1437 | # |
---|
1438 | ############################################################################ |
---|
1439 | |
---|
1440 | # driftnet |
---|
1441 | sub pick_from_driftnet { |
---|
1442 | my ( $timeout ) = @_; |
---|
1443 | |
---|
1444 | my $id = $driftnet_magic; |
---|
1445 | my $dir = $driftnet_dir; |
---|
1446 | my $start = time; |
---|
1447 | my $now; |
---|
1448 | |
---|
1449 | error ("\$driftnet_dir unset?") unless ($dir); |
---|
1450 | $dir =~ s@/+$@@; |
---|
1451 | |
---|
1452 | error ("$dir unreadable") unless (-d "$dir/."); |
---|
1453 | |
---|
1454 | $timeout = $http_timeout unless ($timeout); |
---|
1455 | $last_search = $id; |
---|
1456 | |
---|
1457 | while ($now = time, $now < $start + $timeout) { |
---|
1458 | local *DIR; |
---|
1459 | opendir (DIR, $dir) || error ("$dir: $!"); |
---|
1460 | while (my $file = readdir(DIR)) { |
---|
1461 | next if ($file =~ m/^\./); |
---|
1462 | $file = "$dir/$file"; |
---|
1463 | closedir DIR; |
---|
1464 | LOG ($verbose_load, "picked file $file ($id)"); |
---|
1465 | return ($id, $file); |
---|
1466 | } |
---|
1467 | closedir DIR; |
---|
1468 | } |
---|
1469 | LOG (($verbose_net || $verbose_load), "timed out for $id"); |
---|
1470 | return (); |
---|
1471 | } |
---|
1472 | |
---|
1473 | |
---|
1474 | sub get_driftnet_file { |
---|
1475 | my ($file) = @_; |
---|
1476 | |
---|
1477 | error ("\$driftnet_dir unset?") unless ($driftnet_dir); |
---|
1478 | |
---|
1479 | my $id = $driftnet_magic; |
---|
1480 | my $re = qr/$driftnet_dir/; |
---|
1481 | error ("$id: $file not in $driftnet_dir?") |
---|
1482 | unless ($file =~ m@^$re@o); |
---|
1483 | |
---|
1484 | local *IN; |
---|
1485 | open (IN, $file) || error ("$id: $file: $!"); |
---|
1486 | my $body = ''; |
---|
1487 | while (<IN>) { $body .= $_; } |
---|
1488 | close IN || error ("$id: $file: $!"); |
---|
1489 | unlink ($file) || error ("$id: $file: rm: $!"); |
---|
1490 | return ($id, $body); |
---|
1491 | } |
---|
1492 | |
---|
1493 | |
---|
1494 | sub spawn_driftnet { |
---|
1495 | my ($cmd) = @_; |
---|
1496 | |
---|
1497 | # make a directory to use. |
---|
1498 | while (1) { |
---|
1499 | my $tmp = $ENV{TEMPDIR} || "/tmp"; |
---|
1500 | $driftnet_dir = sprintf ("$tmp/driftcollage-%08x", rand(0xffffffff)); |
---|
1501 | LOG ($verbose_exec, "mkdir $driftnet_dir"); |
---|
1502 | last if mkdir ($driftnet_dir, 0700); |
---|
1503 | } |
---|
1504 | |
---|
1505 | if (! ($cmd =~ m/\s/)) { |
---|
1506 | # if the command didn't have any arguments in it, then it must be just |
---|
1507 | # a pointer to the executable. Append the default args to it. |
---|
1508 | my $dargs = $default_driftnet_cmd; |
---|
1509 | $dargs =~ s/^[^\s]+//; |
---|
1510 | $cmd .= $dargs; |
---|
1511 | } |
---|
1512 | |
---|
1513 | # point the driftnet command at our newly-minted private directory. |
---|
1514 | # |
---|
1515 | $cmd .= " -d $driftnet_dir"; |
---|
1516 | $cmd .= ">/dev/null" unless ($verbose_exec); |
---|
1517 | |
---|
1518 | my $pid = fork(); |
---|
1519 | if ($pid < 0) { error ("fork: $!\n"); } |
---|
1520 | if ($pid) { |
---|
1521 | # parent fork |
---|
1522 | push @pids_to_kill, $pid; |
---|
1523 | LOG ($verbose_exec, "forked for \"$cmd\""); |
---|
1524 | } else { |
---|
1525 | # child fork |
---|
1526 | nontrapping_system ($cmd) || error ("exec: $!"); |
---|
1527 | } |
---|
1528 | |
---|
1529 | # wait a bit, then make sure the process actually started up. |
---|
1530 | # |
---|
1531 | sleep (1); |
---|
1532 | error ("pid $pid failed to start \"$cmd\"") |
---|
1533 | unless (1 == kill (0, $pid)); |
---|
1534 | } |
---|
1535 | |
---|
1536 | |
---|
1537 | ############################################################################ |
---|
1538 | # |
---|
1539 | # Pick a random image in a random way |
---|
1540 | # |
---|
1541 | ############################################################################ |
---|
1542 | |
---|
1543 | |
---|
1544 | # Picks a random image on a random page, and returns two URLs: |
---|
1545 | # the page containing the image, and the image. |
---|
1546 | # Returns () if nothing found this time. |
---|
1547 | # |
---|
1548 | |
---|
1549 | sub pick_image { |
---|
1550 | my ( $timeout ) = @_; |
---|
1551 | |
---|
1552 | $current_state = "select"; |
---|
1553 | $load_method = "none"; |
---|
1554 | |
---|
1555 | my $n = int(rand(100)); |
---|
1556 | my $fn = undef; |
---|
1557 | my $total = 0; |
---|
1558 | my @rest = @search_methods; |
---|
1559 | |
---|
1560 | while (@rest) { |
---|
1561 | my $pct = shift @rest; |
---|
1562 | my $name = shift @rest; |
---|
1563 | my $tfn = shift @rest; |
---|
1564 | $total += $pct; |
---|
1565 | if ($total > $n && !defined($fn)) { |
---|
1566 | $fn = $tfn; |
---|
1567 | $current_state = $name; |
---|
1568 | $load_method = $current_state; |
---|
1569 | } |
---|
1570 | } |
---|
1571 | |
---|
1572 | if ($total != 100) { |
---|
1573 | error ("internal error: \@search_methods totals to $total%!"); |
---|
1574 | } |
---|
1575 | |
---|
1576 | record_attempt ($current_state); |
---|
1577 | return $fn->($timeout); |
---|
1578 | } |
---|
1579 | |
---|
1580 | |
---|
1581 | |
---|
1582 | ############################################################################ |
---|
1583 | # |
---|
1584 | # Statistics and logging |
---|
1585 | # |
---|
1586 | ############################################################################ |
---|
1587 | |
---|
1588 | sub timestr { |
---|
1589 | return strftime ("%H:%M:%S: ", localtime); |
---|
1590 | } |
---|
1591 | |
---|
1592 | sub blurb { |
---|
1593 | return "$progname: " . timestr() . "$current_state: "; |
---|
1594 | } |
---|
1595 | |
---|
1596 | sub error { |
---|
1597 | my ($err) = @_; |
---|
1598 | print STDERR blurb() . "$err\n"; |
---|
1599 | exit 1; |
---|
1600 | } |
---|
1601 | |
---|
1602 | |
---|
1603 | my $lastlog = ""; |
---|
1604 | |
---|
1605 | sub clearlog { |
---|
1606 | $lastlog = ""; |
---|
1607 | } |
---|
1608 | |
---|
1609 | sub showlog { |
---|
1610 | my $head = "$progname: DEBUG: "; |
---|
1611 | foreach (split (/\n/, $lastlog)) { |
---|
1612 | print STDERR "$head$_\n"; |
---|
1613 | } |
---|
1614 | $lastlog = ""; |
---|
1615 | } |
---|
1616 | |
---|
1617 | sub LOG { |
---|
1618 | my ($print, $msg) = @_; |
---|
1619 | my $blurb = timestr() . "$current_state: "; |
---|
1620 | $lastlog .= "$blurb$msg\n"; |
---|
1621 | print STDERR "$progname: $blurb$msg\n" if $print; |
---|
1622 | } |
---|
1623 | |
---|
1624 | |
---|
1625 | my %stats_attempts; |
---|
1626 | my %stats_successes; |
---|
1627 | my %stats_elapsed; |
---|
1628 | |
---|
1629 | my $last_state = undef; |
---|
1630 | sub record_attempt { |
---|
1631 | my ($name) = @_; |
---|
1632 | |
---|
1633 | if ($last_state) { |
---|
1634 | record_failure($last_state) unless ($image_succeeded > 0); |
---|
1635 | } |
---|
1636 | $last_state = $name; |
---|
1637 | |
---|
1638 | clearlog(); |
---|
1639 | report_performance(); |
---|
1640 | |
---|
1641 | start_timer($name); |
---|
1642 | $image_succeeded = 0; |
---|
1643 | $suppress_audit = 0; |
---|
1644 | } |
---|
1645 | |
---|
1646 | sub record_success { |
---|
1647 | my ($name, $url, $base) = @_; |
---|
1648 | if (defined($stats_successes{$name})) { |
---|
1649 | $stats_successes{$name}++; |
---|
1650 | } else { |
---|
1651 | $stats_successes{$name} = 1; |
---|
1652 | } |
---|
1653 | |
---|
1654 | stop_timer ($name, 1); |
---|
1655 | my $o = $current_state; |
---|
1656 | $current_state = $name; |
---|
1657 | save_recent_url ($url, $base); |
---|
1658 | $current_state = $o; |
---|
1659 | $image_succeeded = 1; |
---|
1660 | clearlog(); |
---|
1661 | } |
---|
1662 | |
---|
1663 | |
---|
1664 | sub record_failure { |
---|
1665 | my ($name) = @_; |
---|
1666 | |
---|
1667 | return if $image_succeeded; |
---|
1668 | |
---|
1669 | stop_timer ($name, 0); |
---|
1670 | if ($verbose_load && !$verbose_exec) { |
---|
1671 | |
---|
1672 | if ($suppress_audit) { |
---|
1673 | print STDERR "$progname: " . timestr() . "(audit log suppressed)\n"; |
---|
1674 | return; |
---|
1675 | } |
---|
1676 | |
---|
1677 | my $o = $current_state; |
---|
1678 | $current_state = "DEBUG"; |
---|
1679 | |
---|
1680 | my $line = "#" x 78; |
---|
1681 | print STDERR "\n\n\n"; |
---|
1682 | print STDERR ("#" x 78) . "\n"; |
---|
1683 | print STDERR blurb() . "failed to get an image. Full audit log:\n"; |
---|
1684 | print STDERR "\n"; |
---|
1685 | showlog(); |
---|
1686 | print STDERR ("-" x 78) . "\n"; |
---|
1687 | print STDERR "\n\n"; |
---|
1688 | |
---|
1689 | $current_state = $o; |
---|
1690 | } |
---|
1691 | $image_succeeded = 0; |
---|
1692 | } |
---|
1693 | |
---|
1694 | |
---|
1695 | |
---|
1696 | sub stats_of { |
---|
1697 | my ($name) = @_; |
---|
1698 | my $i = $stats_successes{$name}; |
---|
1699 | my $j = $stats_attempts{$name}; |
---|
1700 | $i = 0 unless $i; |
---|
1701 | $j = 0 unless $j; |
---|
1702 | return "" . ($j ? int($i * 100 / $j) : "0") . "%"; |
---|
1703 | } |
---|
1704 | |
---|
1705 | |
---|
1706 | my $current_start_time = 0; |
---|
1707 | |
---|
1708 | sub start_timer { |
---|
1709 | my ($name) = @_; |
---|
1710 | $current_start_time = time; |
---|
1711 | |
---|
1712 | if (defined($stats_attempts{$name})) { |
---|
1713 | $stats_attempts{$name}++; |
---|
1714 | } else { |
---|
1715 | $stats_attempts{$name} = 1; |
---|
1716 | } |
---|
1717 | if (!defined($stats_elapsed{$name})) { |
---|
1718 | $stats_elapsed{$name} = 0; |
---|
1719 | } |
---|
1720 | } |
---|
1721 | |
---|
1722 | sub stop_timer { |
---|
1723 | my ($name, $success) = @_; |
---|
1724 | $stats_elapsed{$name} += time - $current_start_time; |
---|
1725 | } |
---|
1726 | |
---|
1727 | |
---|
1728 | my $last_report_time = 0; |
---|
1729 | sub report_performance { |
---|
1730 | |
---|
1731 | return unless $verbose_warnings; |
---|
1732 | |
---|
1733 | my $now = time; |
---|
1734 | return unless ($now >= $last_report_time + $report_performance_interval); |
---|
1735 | my $ot = $last_report_time; |
---|
1736 | $last_report_time = $now; |
---|
1737 | |
---|
1738 | return if ($ot == 0); |
---|
1739 | |
---|
1740 | my $blurb = "$progname: " . timestr(); |
---|
1741 | |
---|
1742 | print STDERR "\n"; |
---|
1743 | print STDERR "${blurb}Current standings:\n"; |
---|
1744 | |
---|
1745 | foreach my $name (sort keys (%stats_attempts)) { |
---|
1746 | my $try = $stats_attempts{$name}; |
---|
1747 | my $suc = $stats_successes{$name} || 0; |
---|
1748 | my $pct = int($suc * 100 / $try); |
---|
1749 | my $secs = $stats_elapsed{$name}; |
---|
1750 | my $secs_link = int($secs / $try); |
---|
1751 | print STDERR sprintf ("$blurb %-12s %4s (%d/%d);\t %2d secs/link\n", |
---|
1752 | "$name:", "$pct%", $suc, $try, $secs_link); |
---|
1753 | } |
---|
1754 | } |
---|
1755 | |
---|
1756 | |
---|
1757 | |
---|
1758 | my $max_recent_images = 400; |
---|
1759 | my $max_recent_sites = 20; |
---|
1760 | my @recent_images = (); |
---|
1761 | my @recent_sites = (); |
---|
1762 | |
---|
1763 | sub save_recent_url { |
---|
1764 | my ($url, $base) = @_; |
---|
1765 | |
---|
1766 | return unless ($verbose_warnings); |
---|
1767 | |
---|
1768 | $_ = $url; |
---|
1769 | my ($site) = m@^http://([^ \t\n\r/:]+)@; |
---|
1770 | return unless defined ($site); |
---|
1771 | |
---|
1772 | if ($base eq $driftnet_magic) { |
---|
1773 | $site = $driftnet_magic; |
---|
1774 | @recent_images = (); |
---|
1775 | } |
---|
1776 | |
---|
1777 | my $done = 0; |
---|
1778 | foreach (@recent_images) { |
---|
1779 | if ($_ eq $url) { |
---|
1780 | print STDERR blurb() . "WARNING: recently-duplicated image: $url" . |
---|
1781 | " (on $base via $last_search)\n"; |
---|
1782 | $done = 1; |
---|
1783 | last; |
---|
1784 | } |
---|
1785 | } |
---|
1786 | |
---|
1787 | # suppress "duplicate site" warning via %warningless_sites. |
---|
1788 | # |
---|
1789 | if ($warningless_sites{$site}) { |
---|
1790 | $done = 1; |
---|
1791 | } elsif ($site =~ m@([^.]+\.[^.]+\.[^.]+)$@ && |
---|
1792 | $warningless_sites{$1}) { |
---|
1793 | $done = 1; |
---|
1794 | } elsif ($site =~ m@([^.]+\.[^.]+)$@ && |
---|
1795 | $warningless_sites{$1}) { |
---|
1796 | $done = 1; |
---|
1797 | } |
---|
1798 | |
---|
1799 | if (!$done) { |
---|
1800 | foreach (@recent_sites) { |
---|
1801 | if ($_ eq $site) { |
---|
1802 | print STDERR blurb() . "WARNING: recently-duplicated site: $site" . |
---|
1803 | " ($url on $base via $last_search)\n"; |
---|
1804 | last; |
---|
1805 | } |
---|
1806 | } |
---|
1807 | } |
---|
1808 | |
---|
1809 | push @recent_images, $url; |
---|
1810 | push @recent_sites, $site; |
---|
1811 | shift @recent_images if ($#recent_images >= $max_recent_images); |
---|
1812 | shift @recent_sites if ($#recent_sites >= $max_recent_sites); |
---|
1813 | } |
---|
1814 | |
---|
1815 | |
---|
1816 | |
---|
1817 | ############################################################################## |
---|
1818 | # |
---|
1819 | # other utilities |
---|
1820 | # |
---|
1821 | ############################################################################## |
---|
1822 | |
---|
1823 | # Does %-decoding. |
---|
1824 | # |
---|
1825 | sub url_decode { |
---|
1826 | ($_) = @_; |
---|
1827 | tr/+/ /; |
---|
1828 | s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; |
---|
1829 | return $_; |
---|
1830 | } |
---|
1831 | |
---|
1832 | |
---|
1833 | # Given the raw body of a GIF document, returns the dimensions of the image. |
---|
1834 | # |
---|
1835 | sub gif_size { |
---|
1836 | my ($body) = @_; |
---|
1837 | my $type = substr($body, 0, 6); |
---|
1838 | my $s; |
---|
1839 | return () unless ($type =~ /GIF8[7,9]a/); |
---|
1840 | $s = substr ($body, 6, 10); |
---|
1841 | my ($a,$b,$c,$d) = unpack ("C"x4, $s); |
---|
1842 | return (($b<<8|$a), ($d<<8|$c)); |
---|
1843 | } |
---|
1844 | |
---|
1845 | # Given the raw body of a JPEG document, returns the dimensions of the image. |
---|
1846 | # |
---|
1847 | sub jpeg_size { |
---|
1848 | my ($body) = @_; |
---|
1849 | my $i = 0; |
---|
1850 | my $L = length($body); |
---|
1851 | |
---|
1852 | my $c1 = substr($body, $i, 1); $i++; |
---|
1853 | my $c2 = substr($body, $i, 1); $i++; |
---|
1854 | return () unless (ord($c1) == 0xFF && ord($c2) == 0xD8); |
---|
1855 | |
---|
1856 | my $ch = "0"; |
---|
1857 | while (ord($ch) != 0xDA && $i < $L) { |
---|
1858 | # Find next marker, beginning with 0xFF. |
---|
1859 | while (ord($ch) != 0xFF) { |
---|
1860 | return () if (length($body) <= $i); |
---|
1861 | $ch = substr($body, $i, 1); $i++; |
---|
1862 | } |
---|
1863 | # markers can be padded with any number of 0xFF. |
---|
1864 | while (ord($ch) == 0xFF) { |
---|
1865 | return () if (length($body) <= $i); |
---|
1866 | $ch = substr($body, $i, 1); $i++; |
---|
1867 | } |
---|
1868 | |
---|
1869 | # $ch contains the value of the marker. |
---|
1870 | my $marker = ord($ch); |
---|
1871 | |
---|
1872 | if (($marker >= 0xC0) && |
---|
1873 | ($marker <= 0xCF) && |
---|
1874 | ($marker != 0xC4) && |
---|
1875 | ($marker != 0xCC)) { # it's a SOFn marker |
---|
1876 | $i += 3; |
---|
1877 | return () if (length($body) <= $i); |
---|
1878 | my $s = substr($body, $i, 4); $i += 4; |
---|
1879 | my ($a,$b,$c,$d) = unpack("C"x4, $s); |
---|
1880 | return (($c<<8|$d), ($a<<8|$b)); |
---|
1881 | |
---|
1882 | } else { |
---|
1883 | # We must skip variables, since FFs in variable names aren't |
---|
1884 | # valid JPEG markers. |
---|
1885 | return () if (length($body) <= $i); |
---|
1886 | my $s = substr($body, $i, 2); $i += 2; |
---|
1887 | my ($c1, $c2) = unpack ("C"x2, $s); |
---|
1888 | my $length = ($c1 << 8) | $c2; |
---|
1889 | return () if ($length < 2); |
---|
1890 | $i += $length-2; |
---|
1891 | } |
---|
1892 | } |
---|
1893 | return (); |
---|
1894 | } |
---|
1895 | |
---|
1896 | # Given the raw body of a GIF or JPEG document, returns the dimensions of |
---|
1897 | # the image. |
---|
1898 | # |
---|
1899 | sub image_size { |
---|
1900 | my ($body) = @_; |
---|
1901 | my ($w, $h) = gif_size ($body); |
---|
1902 | if ($w && $h) { return ($w, $h); } |
---|
1903 | return jpeg_size ($body); |
---|
1904 | } |
---|
1905 | |
---|
1906 | |
---|
1907 | # returns the full path of the named program, or undef. |
---|
1908 | # |
---|
1909 | sub which { |
---|
1910 | my ($prog) = @_; |
---|
1911 | foreach (split (/:/, $ENV{PATH})) { |
---|
1912 | if (-x "$_/$prog") { |
---|
1913 | return $prog; |
---|
1914 | } |
---|
1915 | } |
---|
1916 | return undef; |
---|
1917 | } |
---|
1918 | |
---|
1919 | |
---|
1920 | # Like rand(), but chooses numbers with a bell curve distribution. |
---|
1921 | sub bellrand { |
---|
1922 | ($_) = @_; |
---|
1923 | $_ = 1.0 unless defined($_); |
---|
1924 | $_ /= 3.0; |
---|
1925 | return (rand($_) + rand($_) + rand($_)); |
---|
1926 | } |
---|
1927 | |
---|
1928 | |
---|
1929 | sub signal_cleanup { |
---|
1930 | my ($sig) = @_; |
---|
1931 | print STDERR blurb() . (defined($sig) |
---|
1932 | ? "caught signal $sig." |
---|
1933 | : "exiting.") |
---|
1934 | . "\n" |
---|
1935 | if ($verbose_exec); |
---|
1936 | |
---|
1937 | x_cleanup(); |
---|
1938 | |
---|
1939 | if (@pids_to_kill) { |
---|
1940 | print STDERR blurb() . "killing: " . join(' ', @pids_to_kill) . "\n"; |
---|
1941 | kill ('TERM', @pids_to_kill); |
---|
1942 | } |
---|
1943 | |
---|
1944 | exit 1; |
---|
1945 | } |
---|
1946 | |
---|
1947 | ############################################################################## |
---|
1948 | # |
---|
1949 | # Generating a list of urls only |
---|
1950 | # |
---|
1951 | ############################################################################## |
---|
1952 | |
---|
1953 | sub url_only_output { |
---|
1954 | do { |
---|
1955 | my ($base, $img) = pick_image; |
---|
1956 | if ($img) { |
---|
1957 | $base =~ s/ /%20/g; |
---|
1958 | $img =~ s/ /%20/g; |
---|
1959 | print "$img $base\n"; |
---|
1960 | } |
---|
1961 | } while (1); |
---|
1962 | } |
---|
1963 | |
---|
1964 | ############################################################################## |
---|
1965 | # |
---|
1966 | # Running as an xscreensaver module |
---|
1967 | # |
---|
1968 | ############################################################################## |
---|
1969 | |
---|
1970 | my $image_ppm = ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp") . "/webcollage." . $$; |
---|
1971 | my $image_tmp1 = $image_ppm . "-1"; |
---|
1972 | my $image_tmp2 = $image_ppm . "-2"; |
---|
1973 | |
---|
1974 | my $filter_cmd = undef; |
---|
1975 | my $post_filter_cmd = undef; |
---|
1976 | my $background = undef; |
---|
1977 | |
---|
1978 | my $img_width; # size of the image being generated. |
---|
1979 | my $img_height; |
---|
1980 | |
---|
1981 | my $delay = 2; |
---|
1982 | |
---|
1983 | sub x_cleanup { |
---|
1984 | unlink $image_ppm, $image_tmp1, $image_tmp2; |
---|
1985 | } |
---|
1986 | |
---|
1987 | |
---|
1988 | # Like system, but prints status about exit codes, and kills this process |
---|
1989 | # with whatever signal killed the sub-process, if any. |
---|
1990 | # |
---|
1991 | sub nontrapping_system { |
---|
1992 | $! = 0; |
---|
1993 | |
---|
1994 | $_ = join(" ", @_); |
---|
1995 | s/\"[^\"]+\"/\"...\"/g; |
---|
1996 | |
---|
1997 | LOG ($verbose_exec, "executing \"$_\""); |
---|
1998 | |
---|
1999 | my $rc = system @_; |
---|
2000 | |
---|
2001 | if ($rc == 0) { |
---|
2002 | LOG ($verbose_exec, "subproc exited normally."); |
---|
2003 | } elsif (($rc & 0xff) == 0) { |
---|
2004 | $rc >>= 8; |
---|
2005 | LOG ($verbose_exec, "subproc exited with status $rc."); |
---|
2006 | } else { |
---|
2007 | if ($rc & 0x80) { |
---|
2008 | LOG ($verbose_exec, "subproc dumped core."); |
---|
2009 | $rc &= ~0x80; |
---|
2010 | } |
---|
2011 | LOG ($verbose_exec, "subproc died with signal $rc."); |
---|
2012 | # die that way ourselves. |
---|
2013 | kill $rc, $$; |
---|
2014 | } |
---|
2015 | |
---|
2016 | return $rc; |
---|
2017 | } |
---|
2018 | |
---|
2019 | |
---|
2020 | # Given the URL of a GIF or JPEG image, and the body of that image, writes a |
---|
2021 | # PPM to the given output file. Returns the width/height of the image if |
---|
2022 | # successful. |
---|
2023 | # |
---|
2024 | sub image_to_pnm { |
---|
2025 | my ($url, $body, $output) = @_; |
---|
2026 | my ($cmd, $cmd2, $w, $h); |
---|
2027 | |
---|
2028 | if ((@_ = gif_size ($body))) { |
---|
2029 | ($w, $h) = @_; |
---|
2030 | $cmd = "giftopnm"; |
---|
2031 | } elsif ((@_ = jpeg_size ($body))) { |
---|
2032 | ($w, $h) = @_; |
---|
2033 | $cmd = "djpeg"; |
---|
2034 | } else { |
---|
2035 | LOG (($verbose_pbm || $verbose_load), |
---|
2036 | "not a GIF or JPG" . |
---|
2037 | (($body =~ m@<(base|html|head|body|script|table|a href)>@i) |
---|
2038 | ? " (looks like HTML)" : "") . |
---|
2039 | ": $url"); |
---|
2040 | $suppress_audit = 1; |
---|
2041 | return (); |
---|
2042 | } |
---|
2043 | |
---|
2044 | $cmd2 = "exec $cmd"; # yes, this really is necessary. if we don't |
---|
2045 | # do this, the process doesn't die properly. |
---|
2046 | if (!$verbose_pbm) { |
---|
2047 | # |
---|
2048 | # We get a "giftopnm: got a 'Application Extension' extension" |
---|
2049 | # warning any time it's an animgif. |
---|
2050 | # |
---|
2051 | # Note that "giftopnm: EOF / read error on image data" is not |
---|
2052 | # always a fatal error -- sometimes the image looks fine anyway. |
---|
2053 | # |
---|
2054 | $cmd2 .= " 2>/dev/null"; |
---|
2055 | } |
---|
2056 | |
---|
2057 | # There exist corrupted GIF and JPEG files that can make giftopnm and |
---|
2058 | # djpeg lose their minds and go into a loop. So this gives those programs |
---|
2059 | # a small timeout -- if they don't complete in time, kill them. |
---|
2060 | # |
---|
2061 | my $pid; |
---|
2062 | @_ = eval { |
---|
2063 | my $timed_out; |
---|
2064 | |
---|
2065 | local $SIG{ALRM} = sub { |
---|
2066 | LOG ($verbose_pbm, |
---|
2067 | "timed out ($cvt_timeout) for $cmd on \"$url\" in pid $pid"); |
---|
2068 | kill ('TERM', $pid) if ($pid); |
---|
2069 | $timed_out = 1; |
---|
2070 | $body = undef; |
---|
2071 | }; |
---|
2072 | |
---|
2073 | if (($pid = open(PIPE, "| $cmd2 > $output"))) { |
---|
2074 | $timed_out = 0; |
---|
2075 | alarm $cvt_timeout; |
---|
2076 | print PIPE $body; |
---|
2077 | $body = undef; |
---|
2078 | close PIPE; |
---|
2079 | |
---|
2080 | LOG ($verbose_exec, "awaiting $pid"); |
---|
2081 | waitpid ($pid, 0); |
---|
2082 | LOG ($verbose_exec, "$pid completed"); |
---|
2083 | |
---|
2084 | my $size = (stat($output))[7]; |
---|
2085 | $size = -1 unless defined($size); |
---|
2086 | if ($size < 5) { |
---|
2087 | LOG ($verbose_pbm, "$cmd on ${w}x$h \"$url\" failed ($size bytes)"); |
---|
2088 | return (); |
---|
2089 | } |
---|
2090 | |
---|
2091 | LOG ($verbose_pbm, "created ${w}x$h $output ($cmd)"); |
---|
2092 | return ($w, $h); |
---|
2093 | } else { |
---|
2094 | print STDERR blurb() . "$cmd failed: $!\n"; |
---|
2095 | return (); |
---|
2096 | } |
---|
2097 | }; |
---|
2098 | die if ($@ && $@ ne "alarm\n"); # propagate errors |
---|
2099 | if ($@) { |
---|
2100 | # timed out |
---|
2101 | $body = undef; |
---|
2102 | return (); |
---|
2103 | } else { |
---|
2104 | # didn't |
---|
2105 | alarm 0; |
---|
2106 | $body = undef; |
---|
2107 | return @_; |
---|
2108 | } |
---|
2109 | } |
---|
2110 | |
---|
2111 | sub pick_root_displayer { |
---|
2112 | my @names = (); |
---|
2113 | |
---|
2114 | foreach my $cmd (@root_displayers) { |
---|
2115 | $_ = $cmd; |
---|
2116 | my ($name) = m/^([^ ]+)/; |
---|
2117 | push @names, "\"$name\""; |
---|
2118 | LOG ($verbose_exec, "looking for $name..."); |
---|
2119 | foreach my $dir (split (/:/, $ENV{PATH})) { |
---|
2120 | LOG ($verbose_exec, " checking $dir/$name"); |
---|
2121 | return $cmd if (-x "$dir/$name"); |
---|
2122 | } |
---|
2123 | } |
---|
2124 | |
---|
2125 | $names[$#names] = "or " . $names[$#names]; |
---|
2126 | error "none of: " . join (", ", @names) . " were found on \$PATH."; |
---|
2127 | } |
---|
2128 | |
---|
2129 | |
---|
2130 | my $ppm_to_root_window_cmd = undef; |
---|
2131 | |
---|
2132 | |
---|
2133 | sub x_or_pbm_output { |
---|
2134 | |
---|
2135 | # Check for our helper program, to see whether we need to use PPM pipelines. |
---|
2136 | # |
---|
2137 | $_ = "webcollage-helper"; |
---|
2138 | if (defined ($webcollage_helper) || which ($_)) { |
---|
2139 | $webcollage_helper = $_ unless (defined($webcollage_helper)); |
---|
2140 | LOG ($verbose_pbm, "found \"$webcollage_helper\""); |
---|
2141 | $webcollage_helper .= " -v"; |
---|
2142 | } else { |
---|
2143 | LOG (($verbose_pbm || $verbose_load), "no $_ program"); |
---|
2144 | } |
---|
2145 | |
---|
2146 | # make sure the various programs we execute exist, right up front. |
---|
2147 | # |
---|
2148 | my @progs = ("ppmmake"); # always need this one |
---|
2149 | |
---|
2150 | if (!defined($webcollage_helper)) { |
---|
2151 | # Only need these others if we don't have the helper. |
---|
2152 | @progs = (@progs, "giftopnm", "djpeg", "pnmpaste", "pnmscale", "pnmcut"); |
---|
2153 | } |
---|
2154 | |
---|
2155 | foreach (@progs) { |
---|
2156 | which ($_) || error "$_ not found on \$PATH."; |
---|
2157 | } |
---|
2158 | |
---|
2159 | # find a root-window displayer program. |
---|
2160 | # |
---|
2161 | $ppm_to_root_window_cmd = pick_root_displayer(); |
---|
2162 | |
---|
2163 | if (!$img_width || !$img_height) { |
---|
2164 | $_ = "xdpyinfo"; |
---|
2165 | which ($_) || error "$_ not found on \$PATH."; |
---|
2166 | $_ = `$_`; |
---|
2167 | ($img_width, $img_height) = m/dimensions: *(\d+)x(\d+) /; |
---|
2168 | if (!defined($img_height)) { |
---|
2169 | error "xdpyinfo failed."; |
---|
2170 | } |
---|
2171 | } |
---|
2172 | |
---|
2173 | my $bgcolor = "#000000"; |
---|
2174 | my $bgimage = undef; |
---|
2175 | |
---|
2176 | if ($background) { |
---|
2177 | if ($background =~ m/^\#[0-9a-f]+$/i) { |
---|
2178 | $bgcolor = $background; |
---|
2179 | |
---|
2180 | } elsif (-r $background) { |
---|
2181 | $bgimage = $background; |
---|
2182 | |
---|
2183 | } elsif (! $background =~ m@^[-a-z0-9 ]+$@i) { |
---|
2184 | error "not a color or readable file: $background"; |
---|
2185 | |
---|
2186 | } else { |
---|
2187 | # default to assuming it's a color |
---|
2188 | $bgcolor = $background; |
---|
2189 | } |
---|
2190 | } |
---|
2191 | |
---|
2192 | # Create the sold-colored base image. |
---|
2193 | # |
---|
2194 | $_ = "ppmmake '$bgcolor' $img_width $img_height"; |
---|
2195 | LOG ($verbose_pbm, "creating base image: $_"); |
---|
2196 | nontrapping_system "$_ > $image_ppm"; |
---|
2197 | |
---|
2198 | # Paste the default background image in the middle of it. |
---|
2199 | # |
---|
2200 | if ($bgimage) { |
---|
2201 | my ($iw, $ih); |
---|
2202 | |
---|
2203 | my $body = ""; |
---|
2204 | local *IMG; |
---|
2205 | open(IMG, "<$bgimage") || error "couldn't open $bgimage: $!"; |
---|
2206 | my $cmd; |
---|
2207 | while (<IMG>) { $body .= $_; } |
---|
2208 | close (IMG); |
---|
2209 | |
---|
2210 | if ((@_ = gif_size ($body))) { |
---|
2211 | ($iw, $ih) = @_; |
---|
2212 | $cmd = "giftopnm |"; |
---|
2213 | |
---|
2214 | } elsif ((@_ = jpeg_size ($body))) { |
---|
2215 | ($iw, $ih) = @_; |
---|
2216 | $cmd = "djpeg |"; |
---|
2217 | |
---|
2218 | } elsif ($body =~ m/^P\d\n(\d+) (\d+)\n/) { |
---|
2219 | $iw = $1; |
---|
2220 | $ih = $2; |
---|
2221 | $cmd = ""; |
---|
2222 | |
---|
2223 | } else { |
---|
2224 | error "$bgimage is not a GIF, JPEG, or PPM."; |
---|
2225 | } |
---|
2226 | |
---|
2227 | my $x = int (($img_width - $iw) / 2); |
---|
2228 | my $y = int (($img_height - $ih) / 2); |
---|
2229 | LOG ($verbose_pbm, |
---|
2230 | "pasting $bgimage (${iw}x$ih) into base image at $x,$y"); |
---|
2231 | |
---|
2232 | $cmd .= "pnmpaste - $x $y $image_ppm > $image_tmp1"; |
---|
2233 | open (IMG, "| $cmd") || error "running $cmd: $!"; |
---|
2234 | print IMG $body; |
---|
2235 | $body = undef; |
---|
2236 | close (IMG); |
---|
2237 | LOG ($verbose_exec, "subproc exited normally."); |
---|
2238 | rename ($image_tmp1, $image_ppm) || |
---|
2239 | error "renaming $image_tmp1 to $image_ppm: $!"; |
---|
2240 | } |
---|
2241 | |
---|
2242 | clearlog(); |
---|
2243 | |
---|
2244 | while (1) { |
---|
2245 | my ($base, $img) = pick_image(); |
---|
2246 | my $source = $current_state; |
---|
2247 | $current_state = "loadimage"; |
---|
2248 | if ($img) { |
---|
2249 | my ($headers, $body) = get_document ($img, $base); |
---|
2250 | if ($body) { |
---|
2251 | paste_image ($base, $img, $body, $source); |
---|
2252 | $body = undef; |
---|
2253 | } |
---|
2254 | } |
---|
2255 | $current_state = "idle"; |
---|
2256 | $load_method = "none"; |
---|
2257 | |
---|
2258 | unlink $image_tmp1, $image_tmp2; |
---|
2259 | sleep $delay; |
---|
2260 | } |
---|
2261 | } |
---|
2262 | |
---|
2263 | sub paste_image { |
---|
2264 | my ($base, $img, $body, $source) = @_; |
---|
2265 | |
---|
2266 | $current_state = "paste"; |
---|
2267 | |
---|
2268 | $suppress_audit = 0; |
---|
2269 | |
---|
2270 | LOG ($verbose_pbm, "got $img (" . length($body) . ")"); |
---|
2271 | |
---|
2272 | my ($iw, $ih); |
---|
2273 | |
---|
2274 | # If we are using the webcollage-helper, then we do not need to convert this |
---|
2275 | # image to a PPM. But, if we're using a filter command, we still must, since |
---|
2276 | # that's what the filters expect (webcollage-helper can read PPMs, so that's |
---|
2277 | # fine.) |
---|
2278 | # |
---|
2279 | if (defined ($webcollage_helper) && |
---|
2280 | !defined ($filter_cmd)) { |
---|
2281 | |
---|
2282 | ($iw, $ih) = image_size ($body); |
---|
2283 | if (!$iw || !$ih) { |
---|
2284 | LOG (($verbose_pbm || $verbose_load), |
---|
2285 | "not a GIF or JPG" . |
---|
2286 | (($body =~ m@<(base|html|head|body|script|table|a href)>@i) |
---|
2287 | ? " (looks like HTML)" : "") . |
---|
2288 | ": $img"); |
---|
2289 | $suppress_audit = 1; |
---|
2290 | $body = undef; |
---|
2291 | return 0; |
---|
2292 | } |
---|
2293 | |
---|
2294 | local *OUT; |
---|
2295 | open (OUT, ">$image_tmp1") || error ("writing $image_tmp1: $!"); |
---|
2296 | print OUT $body || error ("writing $image_tmp1: $!"); |
---|
2297 | close OUT || error ("writing $image_tmp1: $!"); |
---|
2298 | |
---|
2299 | } else { |
---|
2300 | ($iw, $ih) = image_to_pnm ($img, $body, $image_tmp1); |
---|
2301 | $body = undef; |
---|
2302 | if (!$iw || !$ih) { |
---|
2303 | LOG ($verbose_pbm, "unable to make PBM from $img"); |
---|
2304 | return 0; |
---|
2305 | } |
---|
2306 | } |
---|
2307 | |
---|
2308 | record_success ($load_method, $img, $base); |
---|
2309 | |
---|
2310 | |
---|
2311 | my $ow = $iw; # used only for error messages |
---|
2312 | my $oh = $ih; |
---|
2313 | |
---|
2314 | # don't just tack this onto the front of the pipeline -- we want it to |
---|
2315 | # be able to change the size of the input image. |
---|
2316 | # |
---|
2317 | if ($filter_cmd) { |
---|
2318 | LOG ($verbose_pbm, "running $filter_cmd"); |
---|
2319 | |
---|
2320 | my $rc = nontrapping_system "($filter_cmd) < $image_tmp1 >$image_tmp2"; |
---|
2321 | if ($rc != 0) { |
---|
2322 | LOG(($verbose_pbm || $verbose_load), "failed command: \"$filter_cmd\""); |
---|
2323 | LOG(($verbose_pbm || $verbose_load), "failed URL: \"$img\" (${ow}x$oh)"); |
---|
2324 | return; |
---|
2325 | } |
---|
2326 | rename ($image_tmp2, $image_tmp1); |
---|
2327 | |
---|
2328 | # re-get the width/height in case the filter resized it. |
---|
2329 | local *IMG; |
---|
2330 | open(IMG, "<$image_tmp1") || return 0; |
---|
2331 | $_ = <IMG>; |
---|
2332 | $_ = <IMG>; |
---|
2333 | ($iw, $ih) = m/^(\d+) (\d+)$/; |
---|
2334 | close (IMG); |
---|
2335 | return 0 unless ($iw && $ih); |
---|
2336 | } |
---|
2337 | |
---|
2338 | my $target_w = $img_width; |
---|
2339 | my $target_h = $img_height; |
---|
2340 | |
---|
2341 | my $cmd = ""; |
---|
2342 | my $scale = 1.0; |
---|
2343 | |
---|
2344 | |
---|
2345 | # Usually scale the image to fit on the screen -- but sometimes scale it |
---|
2346 | # to fit on half or a quarter of the screen. Note that we don't merely |
---|
2347 | # scale it to fit, we instead cut it in half until it fits -- that should |
---|
2348 | # give a wider distribution of sizes. |
---|
2349 | # |
---|
2350 | if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; $scale /= 2; } |
---|
2351 | if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; $scale /= 2; } |
---|
2352 | |
---|
2353 | if ($iw > $target_w || $ih > $target_h) { |
---|
2354 | while ($iw > $target_w || |
---|
2355 | $ih > $target_h) { |
---|
2356 | $iw = int($iw / 2); |
---|
2357 | $ih = int($ih / 2); |
---|
2358 | } |
---|
2359 | if ($iw <= 10 || $ih <= 10) { |
---|
2360 | LOG ($verbose_pbm, "scaling to ${iw}x$ih would have been bogus."); |
---|
2361 | return 0; |
---|
2362 | } |
---|
2363 | |
---|
2364 | LOG ($verbose_pbm, "scaling to ${iw}x$ih"); |
---|
2365 | |
---|
2366 | $cmd .= " | pnmscale -xsize $iw -ysize $ih"; |
---|
2367 | } |
---|
2368 | |
---|
2369 | |
---|
2370 | my $src = $image_tmp1; |
---|
2371 | |
---|
2372 | my $crop_x = 0; # the sub-rectangle of the image |
---|
2373 | my $crop_y = 0; # that we will actually paste. |
---|
2374 | my $crop_w = $iw; |
---|
2375 | my $crop_h = $ih; |
---|
2376 | |
---|
2377 | # The chance that we will randomly crop out a section of an image starts |
---|
2378 | # out fairly low, but goes up for images that are very large, or images |
---|
2379 | # that have ratios that make them look like banners (we try to avoid |
---|
2380 | # banner images entirely, but they slip through when the IMG tags didn't |
---|
2381 | # have WIDTH and HEIGHT specified.) |
---|
2382 | # |
---|
2383 | my $crop_chance = 0.2; |
---|
2384 | if ($iw > $img_width * 0.4 || $ih > $img_height * 0.4) { |
---|
2385 | $crop_chance += 0.2; |
---|
2386 | } |
---|
2387 | if ($iw > $img_width * 0.7 || $ih > $img_height * 0.7) { |
---|
2388 | $crop_chance += 0.2; |
---|
2389 | } |
---|
2390 | if ($min_ratio && ($iw * $min_ratio) > $ih) { |
---|
2391 | $crop_chance += 0.7; |
---|
2392 | } |
---|
2393 | |
---|
2394 | if ($crop_chance > 0.1) { |
---|
2395 | LOG ($verbose_pbm, "crop chance: $crop_chance"); |
---|
2396 | } |
---|
2397 | |
---|
2398 | if (rand() < $crop_chance) { |
---|
2399 | |
---|
2400 | my $ow = $crop_w; |
---|
2401 | my $oh = $crop_h; |
---|
2402 | |
---|
2403 | if ($crop_w > $min_width) { |
---|
2404 | # if it's a banner, select the width linearly. |
---|
2405 | # otherwise, select a bell. |
---|
2406 | my $r = (($min_ratio && ($iw * $min_ratio) > $ih) |
---|
2407 | ? rand() |
---|
2408 | : bellrand()); |
---|
2409 | $crop_w = $min_width + int ($r * ($crop_w - $min_width)); |
---|
2410 | $crop_x = int (rand() * ($ow - $crop_w)); |
---|
2411 | } |
---|
2412 | if ($crop_h > $min_height) { |
---|
2413 | # height always selects as a bell. |
---|
2414 | $crop_h = $min_height + int (bellrand() * ($crop_h - $min_height)); |
---|
2415 | $crop_y = int (rand() * ($oh - $crop_h)); |
---|
2416 | } |
---|
2417 | |
---|
2418 | if ($crop_x != 0 || $crop_y != 0 || |
---|
2419 | $crop_w != $iw || $crop_h != $ih) { |
---|
2420 | LOG ($verbose_pbm, |
---|
2421 | "randomly cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y"); |
---|
2422 | } |
---|
2423 | } |
---|
2424 | |
---|
2425 | # Where the image should logically land -- this might be negative. |
---|
2426 | # |
---|
2427 | my $x = int((rand() * ($img_width + $crop_w/2)) - $crop_w*3/4); |
---|
2428 | my $y = int((rand() * ($img_height + $crop_h/2)) - $crop_h*3/4); |
---|
2429 | |
---|
2430 | # if we have chosen to paste the image outside of the rectangle of the |
---|
2431 | # screen, then we need to crop it. |
---|
2432 | # |
---|
2433 | if ($x < 0 || |
---|
2434 | $y < 0 || |
---|
2435 | $x + $crop_w > $img_width || |
---|
2436 | $y + $crop_h > $img_height) { |
---|
2437 | |
---|
2438 | LOG ($verbose_pbm, |
---|
2439 | "cropping for effective paste of ${crop_w}x$crop_h \@ $x,$y"); |
---|
2440 | |
---|
2441 | if ($x < 0) { $crop_x -= $x; $crop_w += $x; $x = 0; } |
---|
2442 | if ($y < 0) { $crop_y -= $y; $crop_h += $y; $y = 0; } |
---|
2443 | |
---|
2444 | if ($x + $crop_w >= $img_width) { $crop_w = $img_width - $x - 1; } |
---|
2445 | if ($y + $crop_h >= $img_height) { $crop_h = $img_height - $y - 1; } |
---|
2446 | } |
---|
2447 | |
---|
2448 | # If any cropping needs to happen, add pnmcut. |
---|
2449 | # |
---|
2450 | if ($crop_x != 0 || $crop_y != 0 || |
---|
2451 | $crop_w != $iw || $crop_h != $ih) { |
---|
2452 | $iw = $crop_w; |
---|
2453 | $ih = $crop_h; |
---|
2454 | $cmd .= " | pnmcut $crop_x $crop_y $iw $ih"; |
---|
2455 | LOG ($verbose_pbm, "cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y"); |
---|
2456 | } |
---|
2457 | |
---|
2458 | LOG ($verbose_pbm, "pasting ${iw}x$ih \@ $x,$y in $image_ppm"); |
---|
2459 | |
---|
2460 | $cmd .= " | pnmpaste - $x $y $image_ppm"; |
---|
2461 | |
---|
2462 | $cmd =~ s@^ *\| *@@; |
---|
2463 | |
---|
2464 | if (defined ($webcollage_helper)) { |
---|
2465 | $cmd = "$webcollage_helper $image_tmp1 $image_ppm " . |
---|
2466 | "$scale $opacity " . |
---|
2467 | "$crop_x $crop_y $x $y " . |
---|
2468 | "$iw $ih"; |
---|
2469 | $_ = $cmd; |
---|
2470 | |
---|
2471 | } else { |
---|
2472 | # use a PPM pipeline |
---|
2473 | $_ = "($cmd)"; |
---|
2474 | $_ .= " < $image_tmp1 > $image_tmp2"; |
---|
2475 | } |
---|
2476 | |
---|
2477 | if ($verbose_pbm) { |
---|
2478 | $_ = "($_) 2>&1 | sed s'/^/" . blurb() . "/'"; |
---|
2479 | } else { |
---|
2480 | $_ .= " 2> /dev/null"; |
---|
2481 | } |
---|
2482 | |
---|
2483 | my $rc = nontrapping_system ($_); |
---|
2484 | |
---|
2485 | if (defined ($webcollage_helper) && -z $image_ppm) { |
---|
2486 | LOG (1, "failed command: \"$cmd\""); |
---|
2487 | print STDERR "\naudit log:\n\n\n"; |
---|
2488 | print STDERR ("#" x 78) . "\n"; |
---|
2489 | print STDERR blurb() . "$image_ppm has zero size\n"; |
---|
2490 | showlog(); |
---|
2491 | print STDERR "\n\n"; |
---|
2492 | exit (1); |
---|
2493 | } |
---|
2494 | |
---|
2495 | if ($rc != 0) { |
---|
2496 | LOG (($verbose_pbm || $verbose_load), "failed command: \"$cmd\""); |
---|
2497 | LOG (($verbose_pbm || $verbose_load), "failed URL: \"$img\" (${ow}x$oh)"); |
---|
2498 | return; |
---|
2499 | } |
---|
2500 | |
---|
2501 | if (!defined ($webcollage_helper)) { |
---|
2502 | rename ($image_tmp2, $image_ppm) || return; |
---|
2503 | } |
---|
2504 | |
---|
2505 | my $target = "$image_ppm"; |
---|
2506 | |
---|
2507 | # don't just tack this onto the end of the pipeline -- we don't want it |
---|
2508 | # to end up in $image_ppm, because we don't want the results to be |
---|
2509 | # cumulative. |
---|
2510 | # |
---|
2511 | if ($post_filter_cmd) { |
---|
2512 | |
---|
2513 | my $cmd; |
---|
2514 | |
---|
2515 | $target = $image_tmp1; |
---|
2516 | if (!defined ($webcollage_helper)) { |
---|
2517 | $cmd = "($post_filter_cmd) < $image_ppm > $target"; |
---|
2518 | } else { |
---|
2519 | # Blah, my scripts need the JPEG data, but some other folks need |
---|
2520 | # the PPM data -- what to do? Ignore the problem, that's what! |
---|
2521 | # $cmd = "djpeg < $image_ppm | ($post_filter_cmd) > $target"; |
---|
2522 | $cmd = "($post_filter_cmd) < $image_ppm > $target"; |
---|
2523 | } |
---|
2524 | |
---|
2525 | $rc = nontrapping_system ($cmd); |
---|
2526 | if ($rc != 0) { |
---|
2527 | LOG ($verbose_pbm, "filter failed: \"$post_filter_cmd\"\n"); |
---|
2528 | return; |
---|
2529 | } |
---|
2530 | } |
---|
2531 | |
---|
2532 | if (!$no_output_p) { |
---|
2533 | my $tsize = (stat($target))[7]; |
---|
2534 | if ($tsize > 200) { |
---|
2535 | $cmd = "$ppm_to_root_window_cmd $target"; |
---|
2536 | |
---|
2537 | # xv seems to hate being killed. it tends to forget to clean |
---|
2538 | # up after itself, and leaves windows around and colors allocated. |
---|
2539 | # I had this same problem with vidwhacker, and I'm not entirely |
---|
2540 | # sure what I did to fix it. But, let's try this: launch xv |
---|
2541 | # in the background, so that killing this process doesn't kill it. |
---|
2542 | # it will die of its own accord soon enough. So this means we |
---|
2543 | # start pumping bits to the root window in parallel with starting |
---|
2544 | # the next network retrieval, which is probably a better thing |
---|
2545 | # to do anyway. |
---|
2546 | # |
---|
2547 | $cmd .= " &"; |
---|
2548 | |
---|
2549 | $rc = nontrapping_system ($cmd); |
---|
2550 | |
---|
2551 | if ($rc != 0) { |
---|
2552 | LOG (($verbose_pbm || $verbose_load), "display failed: \"$cmd\""); |
---|
2553 | return; |
---|
2554 | } |
---|
2555 | |
---|
2556 | } else { |
---|
2557 | LOG ($verbose_pbm, "$target size is $tsize"); |
---|
2558 | } |
---|
2559 | } |
---|
2560 | |
---|
2561 | $source .= "-" . stats_of($source); |
---|
2562 | print STDOUT "image: ${iw}x${ih} @ $x,$y $base $source\n" |
---|
2563 | if ($verbose_imgmap); |
---|
2564 | |
---|
2565 | clearlog(); |
---|
2566 | |
---|
2567 | return 1; |
---|
2568 | } |
---|
2569 | |
---|
2570 | |
---|
2571 | sub init_signals { |
---|
2572 | |
---|
2573 | $SIG{HUP} = \&signal_cleanup; |
---|
2574 | $SIG{INT} = \&signal_cleanup; |
---|
2575 | $SIG{QUIT} = \&signal_cleanup; |
---|
2576 | $SIG{ABRT} = \&signal_cleanup; |
---|
2577 | $SIG{KILL} = \&signal_cleanup; |
---|
2578 | $SIG{TERM} = \&signal_cleanup; |
---|
2579 | |
---|
2580 | # Need this so that if giftopnm dies, we don't die. |
---|
2581 | $SIG{PIPE} = 'IGNORE'; |
---|
2582 | } |
---|
2583 | |
---|
2584 | END { signal_cleanup(); } |
---|
2585 | |
---|
2586 | |
---|
2587 | sub main { |
---|
2588 | $| = 1; |
---|
2589 | srand(time ^ $$); |
---|
2590 | |
---|
2591 | my $verbose = 0; |
---|
2592 | my $dict; |
---|
2593 | my $driftnet_cmd = 0; |
---|
2594 | |
---|
2595 | $current_state = "init"; |
---|
2596 | $load_method = "none"; |
---|
2597 | |
---|
2598 | my $root_p = 0; |
---|
2599 | |
---|
2600 | # historical suckage: the environment variable name is lower case. |
---|
2601 | $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY}; |
---|
2602 | |
---|
2603 | while ($_ = $ARGV[0]) { |
---|
2604 | shift @ARGV; |
---|
2605 | if ($_ eq "-display" || |
---|
2606 | $_ eq "-displ" || |
---|
2607 | $_ eq "-disp" || |
---|
2608 | $_ eq "-dis" || |
---|
2609 | $_ eq "-dpy" || |
---|
2610 | $_ eq "-d") { |
---|
2611 | $ENV{DISPLAY} = shift @ARGV; |
---|
2612 | } elsif ($_ eq "-root") { |
---|
2613 | $root_p = 1; |
---|
2614 | } elsif ($_ eq "-no-output") { |
---|
2615 | $no_output_p = 1; |
---|
2616 | } elsif ($_ eq "-urls-only") { |
---|
2617 | $urls_only_p = 1; |
---|
2618 | $no_output_p = 1; |
---|
2619 | } elsif ($_ eq "-verbose") { |
---|
2620 | $verbose++; |
---|
2621 | } elsif (m/^-v+$/) { |
---|
2622 | $verbose += length($_)-1; |
---|
2623 | } elsif ($_ eq "-delay") { |
---|
2624 | $delay = shift @ARGV; |
---|
2625 | } elsif ($_ eq "-timeout") { |
---|
2626 | $http_timeout = shift @ARGV; |
---|
2627 | } elsif ($_ eq "-filter") { |
---|
2628 | $filter_cmd = shift @ARGV; |
---|
2629 | } elsif ($_ eq "-filter2") { |
---|
2630 | $post_filter_cmd = shift @ARGV; |
---|
2631 | } elsif ($_ eq "-background" || $_ eq "-bg") { |
---|
2632 | $background = shift @ARGV; |
---|
2633 | } elsif ($_ eq "-size") { |
---|
2634 | $_ = shift @ARGV; |
---|
2635 | if (m@^(\d+)x(\d+)$@) { |
---|
2636 | $img_width = $1; |
---|
2637 | $img_height = $2; |
---|
2638 | } else { |
---|
2639 | error "argument to \"-size\" must be of the form \"640x400\""; |
---|
2640 | } |
---|
2641 | } elsif ($_ eq "-proxy" || $_ eq "-http-proxy") { |
---|
2642 | $http_proxy = shift @ARGV; |
---|
2643 | } elsif ($_ eq "-dictionary" || $_ eq "-dict") { |
---|
2644 | $dict = shift @ARGV; |
---|
2645 | } elsif ($_ eq "-driftnet" || $_ eq "--driftnet") { |
---|
2646 | @search_methods = ( 100, "driftnet", \&pick_from_driftnet ); |
---|
2647 | if (! ($ARGV[0] =~ m/^-/)) { |
---|
2648 | $driftnet_cmd = shift @ARGV; |
---|
2649 | } else { |
---|
2650 | $driftnet_cmd = $default_driftnet_cmd; |
---|
2651 | } |
---|
2652 | } elsif ($_ eq "-debug" || $_ eq "--debug") { |
---|
2653 | my $which = shift @ARGV; |
---|
2654 | my @rest = @search_methods; |
---|
2655 | my $ok = 0; |
---|
2656 | while (@rest) { |
---|
2657 | my $pct = shift @rest; |
---|
2658 | my $name = shift @rest; |
---|
2659 | my $tfn = shift @rest; |
---|
2660 | |
---|
2661 | if ($name eq $which) { |
---|
2662 | @search_methods = (100, $name, $tfn); |
---|
2663 | $ok = 1; |
---|
2664 | last; |
---|
2665 | } |
---|
2666 | } |
---|
2667 | error "no such search method as \"$which\"" unless ($ok); |
---|
2668 | LOG (1, "DEBUG: using only \"$which\""); |
---|
2669 | |
---|
2670 | } else { |
---|
2671 | print STDERR "$copyright\nusage: $progname " . |
---|
2672 | "[-root] [-display dpy] [-verbose] [-debug which]\n" . |
---|
2673 | "\t\t [-timeout secs] [-delay secs] [-filter cmd] [-filter2 cmd]\n" . |
---|
2674 | "\t\t [-no-output] [-urls-only] [-background color] [-size WxH]\n" . |
---|
2675 | "\t\t [-dictionary dictionary-file] [-http-proxy host[:port]]\n" . |
---|
2676 | "\t\t [-driftnet [driftnet-program-and-args]]\n" . |
---|
2677 | "\n"; |
---|
2678 | exit 1; |
---|
2679 | } |
---|
2680 | } |
---|
2681 | |
---|
2682 | if ($http_proxy && $http_proxy eq "") { |
---|
2683 | $http_proxy = undef; |
---|
2684 | } |
---|
2685 | if ($http_proxy && $http_proxy =~ m@^http://([^/]*)/?$@ ) { |
---|
2686 | # historical suckage: allow "http://host:port" as well as "host:port". |
---|
2687 | $http_proxy = $1; |
---|
2688 | } |
---|
2689 | |
---|
2690 | if (!$root_p && !$no_output_p) { |
---|
2691 | print STDERR $copyright; |
---|
2692 | error "the -root argument is mandatory (for now.)"; |
---|
2693 | } |
---|
2694 | |
---|
2695 | if (!$no_output_p && !$ENV{DISPLAY}) { |
---|
2696 | error "\$DISPLAY is not set."; |
---|
2697 | } |
---|
2698 | |
---|
2699 | |
---|
2700 | if ($verbose == 1) { |
---|
2701 | $verbose_imgmap = 1; |
---|
2702 | $verbose_warnings = 1; |
---|
2703 | |
---|
2704 | } elsif ($verbose == 2) { |
---|
2705 | $verbose_imgmap = 1; |
---|
2706 | $verbose_warnings = 1; |
---|
2707 | $verbose_load = 1; |
---|
2708 | |
---|
2709 | } elsif ($verbose == 3) { |
---|
2710 | $verbose_imgmap = 1; |
---|
2711 | $verbose_warnings = 1; |
---|
2712 | $verbose_load = 1; |
---|
2713 | $verbose_filter = 1; |
---|
2714 | |
---|
2715 | } elsif ($verbose == 4) { |
---|
2716 | $verbose_imgmap = 1; |
---|
2717 | $verbose_warnings = 1; |
---|
2718 | $verbose_load = 1; |
---|
2719 | $verbose_filter = 1; |
---|
2720 | $verbose_net = 1; |
---|
2721 | |
---|
2722 | } elsif ($verbose == 5) { |
---|
2723 | $verbose_imgmap = 1; |
---|
2724 | $verbose_warnings = 1; |
---|
2725 | $verbose_load = 1; |
---|
2726 | $verbose_filter = 1; |
---|
2727 | $verbose_net = 1; |
---|
2728 | $verbose_pbm = 1; |
---|
2729 | |
---|
2730 | } elsif ($verbose == 6) { |
---|
2731 | $verbose_imgmap = 1; |
---|
2732 | $verbose_warnings = 1; |
---|
2733 | $verbose_load = 1; |
---|
2734 | $verbose_filter = 1; |
---|
2735 | $verbose_net = 1; |
---|
2736 | $verbose_pbm = 1; |
---|
2737 | $verbose_http = 1; |
---|
2738 | |
---|
2739 | } elsif ($verbose >= 7) { |
---|
2740 | $verbose_imgmap = 1; |
---|
2741 | $verbose_warnings = 1; |
---|
2742 | $verbose_load = 1; |
---|
2743 | $verbose_filter = 1; |
---|
2744 | $verbose_net = 1; |
---|
2745 | $verbose_pbm = 1; |
---|
2746 | $verbose_http = 1; |
---|
2747 | $verbose_exec = 1; |
---|
2748 | } |
---|
2749 | |
---|
2750 | if ($dict) { |
---|
2751 | error ("$dict does not exist") unless (-f $dict); |
---|
2752 | $wordlist = $dict; |
---|
2753 | } else { |
---|
2754 | pick_dictionary(); |
---|
2755 | } |
---|
2756 | |
---|
2757 | init_signals(); |
---|
2758 | |
---|
2759 | spawn_driftnet ($driftnet_cmd) if ($driftnet_cmd); |
---|
2760 | |
---|
2761 | if ($urls_only_p) { |
---|
2762 | url_only_output; |
---|
2763 | } else { |
---|
2764 | x_or_pbm_output; |
---|
2765 | } |
---|
2766 | } |
---|
2767 | |
---|
2768 | main; |
---|
2769 | exit (0); |
---|