source: trunk/third/xscreensaver/hacks/ljlatest @ 20148

Revision 20148, 12.8 KB checked in by ghudson, 21 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20147, which included commits to RCS files with non-trunk default branches.
  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2# Copyright © 2003 Jamie Zawinski <jwz@jwz.org>
3#
4# Permission to use, copy, modify, distribute, and sell this software and its
5# documentation for any purpose is hereby granted without fee, provided that
6# the above copyright notice appear in all copies and that both that
7# copyright notice and this permission notice appear in supporting
8# documentation.  No representations are made about the suitability of this
9# software for any purpose.  It is provided "as is" without express or
10# implied warranty.
11#
12# Created: 30-Aug-2003.
13#
14# Spits out the text of the most recent public posts on livejournal.com.
15# This works as the "-program" argument to phosphor, starwars, etc.
16
17require 5;
18use strict;
19
20# We can't "use diagnostics" here, because that library malfunctions if
21# you signal and catch alarms: it says "Uncaught exception from user code"
22# and exits, even though I damned well AM catching it!
23#use diagnostics;
24
25use Socket;
26use Text::Wrap qw(wrap);
27use bytes;  # Larry can take Unicode and shove it up his ass sideways.
28
29my $progname = $0; $progname =~ s@.*/@@g;
30my $version = q{ $Revision: 1.1.1.1 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
31
32my $verbose = 0;
33
34my $url = "http://www.livejournal.com/stats/latest-rss.bml";
35
36my $http_proxy = undef;
37my $http_timeout = 30;
38my $http_timeout2 = 5;
39
40
41# Maps HTML character entities to the corresponding Latin1 characters.
42#
43my %entity_table = (
44   "quot"   => '"', "amp"    => '&', "lt"     => '<', "gt"     => '>',
45   "nbsp"   => ' ', "iexcl"  => '¡', "cent"   => '¢', "pound"  => '£',
46   "curren" => '¤', "yen"    => '¥', "brvbar" => '¦', "sect"   => '§',
47   "uml"    => '¨', "copy"   => '©', "ordf"   => 'ª', "laquo"  => '«',
48   "not"    => '¬', "shy"    => '­', "reg"    => '®', "macr"   => '¯',
49   "deg"    => '°', "plusmn" => '±', "sup2"   => '²', "sup3"   => '³',
50   "acute"  => '´', "micro"  => 'µ', "para"   => '¶', "middot" => '·',
51   "cedil"  => '¸', "sup1"   => '¹', "ordm"   => 'º', "raquo"  => '»',
52   "frac14" => '¼', "frac12" => '½', "frac34" => '¾', "iquest" => '¿',
53   "Agrave" => 'À', "Aacute" => 'Á', "Acirc"  => 'Â', "Atilde" => 'Ã',
54   "Auml"   => 'Ä', "Aring"  => 'Å', "AElig"  => 'Æ', "Ccedil" => 'Ç',
55   "Egrave" => 'È', "Eacute" => 'É', "Ecirc"  => 'Ê', "Euml"   => 'Ë',
56   "Igrave" => 'Ì', "Iacute" => 'Í', "Icirc"  => 'Î', "Iuml"   => 'Ï',
57   "ETH"    => 'Ð', "Ntilde" => 'Ñ', "Ograve" => 'Ò', "Oacute" => 'Ó',
58   "Ocirc"  => 'Ô', "Otilde" => 'Õ', "Ouml"   => 'Ö', "times"  => '×',
59   "Oslash" => 'Ø', "Ugrave" => 'Ù', "Uacute" => 'Ú', "Ucirc"  => 'Û',
60   "Uuml"   => 'Ü', "Yacute" => 'Ý', "THORN"  => 'Þ', "szlig"  => 'ß',
61   "agrave" => 'à', "aacute" => 'á', "acirc"  => 'â', "atilde" => 'ã',
62   "auml"   => 'ä', "aring"  => 'å', "aelig"  => 'æ', "ccedil" => 'ç',
63   "egrave" => 'è', "eacute" => 'é', "ecirc"  => 'ê', "euml"   => 'ë',
64   "igrave" => 'ì', "iacute" => 'í', "icirc"  => 'î', "iuml"   => 'ï',
65   "eth"    => 'ð', "ntilde" => 'ñ', "ograve" => 'ò', "oacute" => 'ó',
66   "ocirc"  => 'ô', "otilde" => 'õ', "ouml"   => 'ö', "divide" => '÷',
67   "oslash" => 'ø', "ugrave" => 'ù', "uacute" => 'ú', "ucirc"  => 'û',
68   "uuml"   => 'ü', "yacute" => 'ý', "thorn"  => 'þ', "yuml"   => 'ÿ',
69   "apos"   => '\''
70);
71
72# Maps certain UTF8 characters (2 or 3 bytes) to the corresponding
73# Latin1 characters.
74#
75my %unicode_latin1_table = (
76   "\xC2\xA1" => '¡', "\xC2\xA2" => '¢', "\xC2\xA3" => '£', "\xC2\xA4" => '¤',
77   "\xC2\xA5" => '¥', "\xC2\xA6" => '¦', "\xC2\xA7" => '§', "\xC2\xA8" => '¨',
78   "\xC2\xA9" => '©', "\xC2\xAA" => 'ª', "\xC2\xAB" => '«', "\xC2\xAC" => '¬',
79   "\xC2\xAD" => '­', "\xC2\xAE" => '®', "\xC2\xAF" => '¯', "\xC2\xB0" => '°',
80   "\xC2\xB1" => '±', "\xC2\xB2" => '²', "\xC2\xB3" => '³', "\xC2\xB4" => '´',
81   "\xC2\xB5" => 'µ', "\xC2\xB6" => '¶', "\xC2\xB7" => '·', "\xC2\xB8" => '¸',
82   "\xC2\xB9" => '¹', "\xC2\xBA" => 'º', "\xC2\xBB" => '»', "\xC2\xBC" => '¼',
83   "\xC2\xBD" => '½', "\xC2\xBE" => '¾', "\xC2\xBF" => '¿', "\xC3\x80" => 'À',
84   "\xC3\x81" => 'Á', "\xC3\x82" => 'Â', "\xC3\x83" => 'Ã', "\xC3\x84" => 'Ä',
85   "\xC3\x85" => 'Å', "\xC3\x86" => 'Æ', "\xC3\x87" => 'Ç', "\xC3\x88" => 'È',
86   "\xC3\x89" => 'É', "\xC3\x8A" => 'Ê', "\xC3\x8B" => 'Ë', "\xC3\x8C" => 'Ì',
87   "\xC3\x8D" => 'Í', "\xC3\x8E" => 'Î', "\xC3\x8F" => 'Ï', "\xC3\x90" => 'Ð',
88   "\xC3\x91" => 'Ñ', "\xC3\x92" => 'Ò', "\xC3\x93" => 'Ó', "\xC3\x94" => 'Ô',
89   "\xC3\x95" => 'Õ', "\xC3\x96" => 'Ö', "\xC3\x97" => '×', "\xC3\x98" => 'Ø',
90   "\xC3\x99" => 'Ù', "\xC3\x9A" => 'Ú', "\xC3\x9B" => 'Û', "\xC3\x9C" => 'Ü',
91   "\xC3\x9D" => 'Ý', "\xC3\x9E" => 'Þ', "\xC3\x9F" => 'ß', "\xC3\xA0" => 'à',
92   "\xC3\xA1" => 'á', "\xC3\xA2" => 'â', "\xC3\xA3" => 'ã', "\xC3\xA4" => 'ä',
93   "\xC3\xA5" => 'å', "\xC3\xA6" => 'æ', "\xC3\xA7" => 'ç', "\xC3\xA8" => 'è',
94   "\xC3\xA9" => 'é', "\xC3\xAA" => 'ê', "\xC3\xAB" => 'ë', "\xC3\xAC" => 'ì',
95   "\xC3\xAD" => 'í', "\xC3\xAE" => 'î', "\xC3\xAF" => 'ï', "\xC3\xB0" => 'ð',
96   "\xC3\xB1" => 'ñ', "\xC3\xB2" => 'ò', "\xC3\xB3" => 'ó', "\xC3\xB4" => 'ô',
97   "\xC3\xB5" => 'õ', "\xC3\xB6" => 'ö', "\xC3\xB7" => '÷', "\xC3\xB8" => 'ø',
98   "\xC3\xB9" => 'ù', "\xC3\xBA" => 'ú', "\xC3\xBB" => 'û', "\xC3\xBC" => 'ü',
99   "\xC3\xBD" => 'ý', "\xC3\xBE" => 'þ', "\xC3\xBF" => 'ÿ',
100
101   "\xE2\x80\x93" => '--',  "\xE2\x80\x94" => '--',
102   "\xE2\x80\x98" => '`',   "\xE2\x80\x99" => '\'',
103   "\xE2\x80\x9C" => "``",  "\xE2\x80\x9D" => "''",
104   "\xE2\x80\xA6" => '...',
105);
106
107
108# Convert any HTML entities to Latin1 characters.
109#
110sub de_entify {
111  my ($text) = @_;
112  $text =~ s/(&(\#)?([[:alpha:]\d]+);?)/
113    {
114     my $c;
115     if ($2) {
116       $c = chr($3);  # the &#number is always decimal, right?
117     } else {
118       $c = $entity_table{$3};
119     }
120#    print STDERR "$progname: warning: unknown HTML character entity \"$1\"\n"
121#     unless $c;
122     ($c ? $c : "[$3]");
123    }
124   /gexi;
125  return $text;
126}
127
128
129# Convert any Unicode characters to Latin1 if possible.
130# Unconvertable bytes are left alone.
131#
132sub de_unicoddle {
133  my ($text) = @_;
134  foreach my $key (keys (%unicode_latin1_table)) {
135    my $val = $unicode_latin1_table{$key};
136    $text =~ s/$key/$val/gs;
137  }
138  return $text;
139}
140
141
142# returns three values: the HTTP response line; the document headers;
143# and the document body.
144#
145sub get_document {
146  my ( $url ) = @_;
147
148  my $timeout  = $http_timeout;
149  my $timeout2 = $http_timeout2;
150
151  print STDERR "$progname: loading $url\n" if ($verbose);
152
153  if (! ($url =~ m@^http://@i)) {
154    error ("not an HTTP URL: $url");
155  }
156
157  my ($url_proto, $dummy, $serverstring, $path) = split(/\//, $url, 4);
158  $path = "" unless $path;
159
160  my ($them,$port) = split(/:/, $serverstring);
161  $port = 80 unless $port;
162
163  my $them2 = $them;
164  my $port2 = $port;
165  if ($http_proxy) {
166    $serverstring = $http_proxy if $http_proxy;
167    $serverstring =~ s@^[a-z]+://@@;
168    ($them2,$port2) = split(/:/, $serverstring);
169    $port2 = 80 unless $port2;
170  }
171
172  my ($remote, $iaddr, $paddr, $proto, $line);
173  $remote = $them2;
174  if ($port2 =~ /\D/) { $port2 = getservbyname($port2, 'tcp') }
175  if (!$port2) {
176    error ("unrecognised port in $url");
177  }
178  $iaddr   = inet_aton($remote);
179  if (!$iaddr) {
180    error ("host not found: $remote");
181  }
182  $paddr   = sockaddr_in($port2, $iaddr);
183
184
185  my $head = "";
186  my $body = "";
187
188  @_ =
189    eval {
190      local $SIG{ALRM} = sub {
191        if ($body ne '') {
192          print STDERR "$progname: timed out ($timeout) in headers for $url\n";
193        } else {
194          print STDERR "$progname: timed out ($timeout2) in body for $url\n";
195        }
196        die "alarm\n";
197      };
198      alarm $timeout;
199
200      $proto   = getprotobyname('tcp');
201      if (!socket(S, PF_INET, SOCK_STREAM, $proto)) {
202        error ("socket: $!");
203      }
204      if (!connect(S, $paddr)) {
205        error ("connect($serverstring): $!");
206      }
207
208      select(S); $| = 1; select(STDOUT);
209
210      my $user_agent = "$progname/$version";
211
212      my $hdrs = "GET " . ($http_proxy ? $url : "/$path") . " HTTP/1.0\r\n" .
213                 "Host: $them\r\n" .
214                 "User-Agent: $user_agent\r\n";
215      $hdrs .= "\r\n";
216
217      if ($verbose > 1) {
218        foreach (split('\r?\n', $hdrs)) {
219          print STDERR "  ==> $_\n";
220        }
221      }
222
223      print S $hdrs;
224      my $http = <S> || "";
225
226      $_  = $http;
227      s/[\r\n]+$//s;
228      print STDERR "  <== $_\n" if ($verbose > 1);
229
230      while (<S>) {
231        $head .= $_;
232        s/[\r\n]+$//s;
233        last if m@^$@;
234        print STDERR "  <== $_\n" if ($verbose > 1);
235      }
236
237      my $lines = 0;
238      while (<S>) {
239        $body .= $_;
240        $lines++;
241
242        # we wait $timeout secs to get the first body line; after
243        # that, we time out if we haven't received a subsequent line
244        # in $timeout2 seconds.
245        #
246        alarm $timeout2;
247      }
248
249      print STDERR ("  <== [ body ]: $lines lines, " .
250                    length($body) . " bytes\n")
251        if ($verbose > 1);
252
253      close S;
254
255      if (!$http) {
256        print STDERR "$progname: null response: $url\n" if ($verbose);
257      }
258
259      return ( $http, $head, $body );
260    };
261  die if ($@ && $@ ne "alarm\n");       # propagate errors
262  if ($@) {
263    # timed out
264    return ();
265  } else {
266    # didn't
267    alarm 0;
268    return @_;
269  }
270}
271
272
273sub lj_latest {
274  my ($images_p, $count, $cols) = @_;
275
276  $|=1;  # unbuffer stdout
277
278  $_ = $url;
279  s@^[a-z]+:/+([^/?\#]+).*$@$1@;
280  my $host = $_;
281
282  print STDOUT "Contacting $host..." if ($verbose);
283
284  my ($http, $head, $body) = get_document ($url);
285
286  if (!$body) {
287    print STDOUT "$progname: no response from $host\n";
288    return;
289  }
290
291  print STDOUT "\n\n" if ($verbose);
292
293#  $body = `cat /tmp/last`;
294#  if (1) {
295#    local *OUT;
296#    open OUT, ">/tmp/last";
297#    print OUT "$http\n$head\n$body\n";
298#    close OUT;
299#  }
300
301  $body =~ s/(<item\b)/\001\001$1/gsi;
302  my $i = 0;
303  foreach (split (/\001\001/, $body)) {
304    next unless m/^<item\b/i;
305    last if (defined ($count) && $i >= $count);
306
307    my ($ig0, $title) = m@<(TITLE       [^<>\s]*)[^<>]*>\s*(.*?)\s*</\1>@xsi;
308    my ($ig1, $body)  = m@<(DESCRIPTION [^<>\s]*)[^<>]*>\s*(.*?)\s*</\1>@xsi;
309    my ($ig2, $url)   = m@<(LINK        [^<>\s]*)[^<>]*>\s*(.*?)\s*</\1>@xsi;
310
311    $_ = "$title\n\n$body";
312
313    s@<[^<>]*>@@gs;                 # lose all XML tags
314    $_ = de_unicoddle ($_);         # convert UTF8 to Latin1
315    $_ = de_entify ($_);            # convert entities to get HTML from XML
316
317    if ($images_p) {
318      s/</\001\001</gs;
319      foreach (split (/\001\001/, $_)) {
320        next unless m/^(<img\b[^<>]+>)/i;
321        $_ = $1;
322        my ($src) = m/\bSRC    \s*=\s*[\"\']?([^<>\"\'\s]+)/xsi;
323        next unless ($src);
324        next if ($src =~ m@^http://[^./]+\.livejournal\.com\b@); # builtins
325
326        my ($w)   = m/\bWIDTH  \s*=\s*[\"\']?(\d+)/xsi;
327        my ($h)   = m/\bHEIGHT \s*=\s*[\"\']?(\d+)/xsi;
328
329        $_ = "<A HREF=\"$url\"><IMG SRC=\"$src\"";
330        $_ .= " WIDTH=$w" if ($w);
331        $_ .= " HEIGHT=$h" if ($h);
332        $_ .= " BORDER=1 HSPACE=4 VSPACE=4></A><BR>\n";
333        print STDOUT $_;
334        $i++;
335      }
336
337    } else {  # emit text/plain
338
339      s@</?(BR|TR|TD|LI|DIV)\b[^<>]*>@\n@gsi; # line break at BR, TD, DIV, etc
340      s@</?(P|UL|OL|BLOCKQUOTE)\b[^<>]*>@\n\n@gsi; # two line breaks
341
342      s@<lj\s+user=\"?([^<>\"]+)\"?[^<>]*>?@$1@gsi;  # handle <LJ USER=>
343      s@</?[BI]>@*@gsi;               # bold, italic => asterisks
344
345      s@<[^<>]*>?@@gs;                # lose all other HTML tags
346      $_ = de_entify ($_);            # convert entities in the html too
347
348      # elide any remaining non-Latin1 binary data...
349      s/([\177-\377]+(\s*[\177-\377]+)[^a-z\d]*)/«...» /g;
350      #s/([\177-\377]+(\s*[\177-\377]+)[^a-z\d]*)/«$1» /g;
351
352      $_ .= "\n";
353
354      s/[ \t]*$//gm;                  # lose whitespace at end of line
355      s@\n\n\n+@\n\n@gs;              # compress blank lines
356
357      $Text::Wrap::columns = $cols;
358      $_ = wrap ("", "  ", $_);       # wrap the lines as a paragraph
359
360      s/[ \t]*$//gm;                  # lose whitespace at end of line again
361      s/^\s+//s;                    # de-indent first line
362      $_ .= "\n";                   # blank line at very end
363      print STDOUT $_;
364      $i++;
365    }
366  }
367}
368
369
370sub error {
371  ($_) = @_;
372  print STDERR "$progname: $_\n";
373  exit 1;
374}
375
376sub usage {
377  print STDERR "usage: $progname [--verbose] [--count N] [--columns N]" .
378    " [--images] [rss-url]\n";
379  exit 1;
380}
381
382sub main {
383  my $count = undef;
384  my $images_p = 0;
385  my $cols = 72;
386  while ($_ = $ARGV[0]) {
387    shift @ARGV;
388    if ($_ eq "--verbose") { $verbose++; }
389    elsif (m/^-v+$/) { $verbose += length($_)-1; }
390    elsif ($_ eq "--count") { $count = 0 + shift @ARGV; }
391    elsif ($_ eq "--images") { $images_p = 1; }
392    elsif ($_ eq "--columns" ||
393           $_ eq "--column" ||
394           $_ eq "--cols" ||
395           $_ eq "--col") {
396      $cols = 0 + shift @ARGV; }
397    elsif (m/^-./) { usage; }
398    elsif (m@^http://@) { $url = $_; }
399    else { usage; }
400  }
401
402  # historical suckage: the environment variable name is lower case.
403  $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY};
404
405  lj_latest ($images_p, $count, $cols);
406}
407
408main;
409exit 0;
Note: See TracBrowser for help on using the repository browser.