#!/usr/bin/perl
#
# ripdiaries.pl -- suck down diaries from k5 or other scoop site
#
# the first time you run this, it will retrieve the 50 most recent diary
# entries.  after that, it will only download entries created since last time.
# this also filters out ads, font tags, and most of the tables, which should
# make it more usable on PDAs.
#
# creates index.html and a bunch of numbered ones.  you should run this script
# in an empty directory.  the following command should produce a ready-to-sync
# .pdb:
#
#   plucker-build -f diaries -M 2 -H index.html
#
# this program is in the public domain.  you can do whatever with it, but i'd
# appreciate credit and preferably a link to quadium.net.
#
# check http://quadium.net/code/ for updates
#
# -- vsync@quadium.net
#
# NOTE(review): the copy of this script that was reviewed had been run through
# an HTML stripper: every "<...>" span (filehandle reads, markup inside
# patterns and inside printed strings) was missing and the line breaks were
# gone.  The filehandle reads are restored from context.  Markup that could
# not be recovered is marked with NOTE(review) below and must be checked
# against the pristine script / the live site before relying on it.

use strict;
use warnings;

# ---------------------------------------------------------------- settings

my $site        = "http://www.kuro5hin.org";
my $commentmode = "nested";
my $runtime     = localtime();

# File remembering the newest story id seen by the previous run.
my $state_file = "$ENV{HOME}/.lastdiary";

# Placeholder meaning "no previous run": fetch one page of results only.
my $first_run = "shiny new user";

# One search-result line: captures story id, title, author.
# NOTE(review): only "(.*).*>(.*)'s Diary" survived the stripping; the anchor
# markup around the first two captures is a reconstruction -- confirm.
my $entry_re = qr{<A\s+HREF="[^"]*?/story/([^"]+)"[^>]*>(.*?)</A>.*>(.*)'s\ Diary}i;

# Substrings that switch line-dropping on / off while copying a story page
# (used to cut ads and layout tables).  A line containing a start marker is
# dropped; a line containing an end marker is kept.
# NOTE(review): the five start and five end marker strings were lost in the
# stripping.  With empty lists nothing is dropped, which is also what the
# damaged code did in effect (an empty marker matches every line, so dropping
# was switched on and straight off again).  Restore the real markers here.
my @skip_start = ();
my @skip_end   = ();

# ------------------------------------------------------ previous run state

my $lastdiary = $first_run;
if (open(my $state, '<', $state_file)) {
    my $saved = <$state>;
    close $state;
    if (defined $saved) {
        chomp $saved;
        # An empty id can never match a story and would page forever.
        $lastdiary = $saved if length $saved;
    }
}

# ------------------------------------------------- collect new diary ids

# Entries are collected newest first: [sid, title, author, dateline].
my @diaries;
my $page = 0;

YANKSIDS: while (++$page) {
    my $url = "$site/?op=search&type=diary&count=50&next_page=$page&next=y";

    # List-form pipe open: no shell is involved, so nothing in the URL needs
    # quoting and nothing scraped from the site can be run as a command.
    open(my $index, '-|', 'lynx', '-source', $url)
        or die "ripdiaries: cannot run lynx for $url: $!\n";

    my $found_on_page = 0;
    while (my $line = <$index>) {
        next unless $line =~ $entry_re;
        my ($sid, $title, $author) = ($1, $2, $3);

        # Everything from here on was fetched by an earlier run.
        last YANKSIDS if $lastdiary eq $sid;

        # The line after the entry carries "... on <date>".
        my $dateline = '';
        my $next     = <$index>;
        if (defined $next && $next =~ /on\ (.*)/) {
            $dateline = $1;
        }

        push @diaries, [$sid, $title, $author, $dateline];
        $found_on_page++;
    }
    close $index;

    last if $lastdiary eq $first_run;

    # Ran off the end of the results without meeting the saved id (story
    # deleted, site down, markup changed): stop instead of paging forever.
    last unless $found_on_page;
}

# ------------------------------------------- write index and story pages

# NOTE(review): all tags in the strings printed to index.html were stripped
# from the reviewed copy; the text and "\n" positions are original, the tags
# are a reconstruction -- confirm.
open(my $out, '>', 'index.html')
    or die "ripdiaries: cannot write index.html: $!\n";
print $out "<HTML>\n<HEAD>\n<TITLE>Diaries Ripped at $runtime</TITLE>\n</HEAD>\n";
print $out "<BODY>\n<H1>Diaries Ripped at $runtime</H1>\n";
print $out "<DL>\n";

# Oldest first, as before (the original popped from the end of the list).
for my $entry (reverse @diaries) {
    my ($sid, $title, $author, $dateline) = @$entry;
    (my $filename = $sid) =~ tr!/!.!;

    print $out "<DT><A HREF=\"$filename.html\">$author - $title</A>\n";
    print $out "<DD>$dateline\n";

    my $url = "$site/?op=displaystory;sid=$sid;commentmode=$commentmode";
    open(my $raw, '-|', 'lynx', '-source', '-dump', $url)
        or die "ripdiaries: cannot run lynx for $url: $!\n";
    open(my $stripped, '>', "$filename.html")
        or die "ripdiaries: cannot write $filename.html: $!\n";

    my $ignore = 0;
    while (my $line = <$raw>) {
        chomp $line;
        $line =~ s/\r//;

        # NOTE(review): the opening-tag pattern survived only as "]*>";
        # "<FONT[^>" is restored to pair with the closing-tag rule.
        $line =~ s/<FONT[^>]*>//g;
        $line =~ s/<\/FONT>//g;

        # Start markers are tested first so that a line holding both kinds
        # of marker is kept, as in the original.
        $ignore = 1 if grep { index($line, $_) != -1 } @skip_start;
        $ignore = 0 if grep { index($line, $_) != -1 } @skip_end;

        print $stripped "$line\n" unless $ignore;
    }

    close $stripped
        or die "ripdiaries: error writing $filename.html: $!\n";
    close $raw;
}

print $out "</DL>\n<HR>\n";
print $out "<P>Generated by ripdiaries.pl, available at\n";
print $out "<A HREF=\"http://quadium.net/code/\">quadium.net</A>.</P>\n";
print $out "</BODY></HTML>\n";
close $out
    or die "ripdiaries: error writing index.html: $!\n";

# ------------------------------------------------------- remember state

# Newest id seen: head of the list, or the old value when nothing was new.
# (The original wrote whatever $sid last held, which was undefined when the
# very first run found nothing and left a blank id behind.)
my $newest = @diaries ? $diaries[0][0] : $lastdiary;

open(my $state, '>', $state_file)
    or die "ripdiaries: cannot write $state_file: $!\n";
print $state "$newest\n";
print $state "$lastdiary\n";    # previous id, kept on line 2 as before
close $state
    or die "ripdiaries: error writing $state_file: $!\n";