#!/usr/bin/perl
#
# Reads every input line (STDIN or the files named on the command line),
# scans it for what appears to be Slashdot front-page HTML, and prints a
# channel header, one record per article (headline, link, body text, creator,
# subject, date, department, section) and a trailing "Search Slashdot" record.
#
# NOTE(review): this copy of the script looks as if it went through a
# markup-stripping pass: the output strings carry no element markup at all,
# several regexes/conditions are visibly truncated, and the braces do not
# balance. Presumably the output was an RSS/RDF-style feed -- TODO confirm
# against the original file before relying on anything below.
#
# $found acts as a per-article line counter / state variable:
#   2..20  lines scanned for metadata (subject, creator, dept, date)
#   21     line taken as the start of the article body ($main)
#   > 20   body lines are accumulated until a "Read More..." line is seen,
#          at which point the record is printed and $found is reset to 0
use Date::Parse;
use Date::Format;

# Slurp all input lines into memory.
@lines = <>;

# Fixed channel header. Each string is one output line; only the text content
# is visible here (NOTE(review): the surrounding element markup appears to be
# missing -- verify).
print "\n\n".
      "\n".
      "\n".
      " Slashdot\n".
      " http://slashdot.org/\n".
      " News for nerds, stuff that matters\n".
      " en\n".
      " Copyright 1997-2004, OSDN - Open Source Development Network,Inc. All Rights Reserved.\n".
      " 2004-06-01T12:13:52+00:00\n".
      " OSDN\n".
      " pater\@slashdot.org\n".
      " Technology\n".
      " hourly\n".
      " 1\n".
      " 1970-01-01T00:00+00:00\n".
      " \n".
      " \n".
      "\n\n".
      "\n".
      "Slashdot\n".
      "http://images.slashdot.org/topics/topicslashdot.gif\n".
      "http://slashdot.org/\n".
      "\n\n";

foreach (@lines) {
    chomp;
    # $count is incremented for every line but never read in the visible code.
    $count++;

    # Headline / section detection.
    # NOTE(review): the next line is damaged -- a string comparison against
    # $_ runs straight into the tail of a regex, so at least one condition
    # and one opening brace are missing (the closing brace after "next;"
    # below has no visible partner). It cannot be repaired from this copy.
    # What remains shows: the headline is taken from $1.$2 or $1 of the
    # regexes below, the section defaults to 'articles' and is replaced by
    # the host prefix of a //NAME.slashdot.org/ link when one is present,
    # then $found is advanced and the line is consumed.
    if ($_ eq "\tWIDTH=\"13\" HEIGHT=\"16\" ALT=\"\" ALIGN=\"TOP\">/) {
            /FONT COLOR=\"#FFFFFF\">(.*)<\/FONT><\/A>(.*)<\/B>/;
            $headline = $1.$2
        } else {
            /(.*)<\/B>/;
            $headline = $1
        }
        $section = 'articles';
        if (/HREF="\/\/([^\.]+).slashdot.org\/"/) {
            $section = $1;
        }
        $found++;
        next;
    }

    if (($found > 1) && ($found < 21)) {
        # Extract additional metadata. At most one pattern is applied per
        # line; every line in this window advances $found by one.
        if (m/topics\/topic([^\.]+)\.gif/) {
            # Topic image file name (topicNAME.gif) supplies the subject.
            $subject = $1;
        } elsif (m/^([^<]+)<\/A>$/) {
            # A line consisting only of link text plus the closing anchor.
            $creator = $1;
        } elsif (m/from the (.+) dept\.<\/B>/) {
            $dept = $1;
        } elsif (m/on \w+ (\w+ \d{1,2}), @([0-9:]{5}(?:AM|PM))<\/B>/) {
            # Only works for the default timezone and display!
            # $1 is "word day-number", $2 a five-character time plus AM/PM.
            # The text is parsed as zone "est" and re-formatted in "gmt"
            # with a fixed +00:00 suffix.
            $date = str2time("$1 $2", "est");
            $date = time2str("%Y-%m-%dT%H:%M:%S+00:00", $date, "gmt");
        } elsif (m/^<\/TD><\/TR><\/TABLE>/ && !$subject) {
            # Early end to topic images; increment found to compensate
            $found += 9;
        }
        $found++;
        next;
    }

    if ($found == 21) {
        # In theory we've now hit the line with description and link
        $main = $_;
        $found++;
        $insidemain = 1;
        next;
    }

    if ($found > 20) {
        if (m/Read More...<\/B>/i) {
            # End of the article: build the link, clean up the body, emit
            # the record.
            # The link is "http:" plus the text between the first pair of
            # double quotes on this line, cut off at the first "&".
            $link = "http:".(split(/\"/,$_))[1];
            $link = (split(/&/,$link))[0];
            $found = 0;
            # Make protocol-relative links in the body absolute.
            $main =~ s/HREF="\/\//HREF="http:\/\//g;
            # NOTE(review): the substitutions on &, the empty pattern, the
            # double quote and the single quote look like entity escaping
            # whose replacement text was decoded when this copy was made:
            # as written they replace a character with itself, and s//>/g
            # has an empty pattern, which Perl treats as "reuse the last
            # successfully matched pattern". Verify against the original.
            $main =~ s/&/&/g;
            $main =~ s//>/g;
            $main =~ s/\"/"/g;
            $main =~ s/\'/'/g;
            # Carriage returns in the body become newlines.
            $main =~ s/\r/\n/g;
            $headline =~ s/&/&/g;
            # One article record, one field per output line (NOTE(review):
            # element markup appears to be missing here as well).
            print "\n".
                  "$headline\n".
                  "$link\n".
                  "$main\n".
                  "$creator\n".
                  "$subject\n".
                  "$date\n".
                  "$dept\n".
                  "$section\n".
                  "\n\n";
            # These should be emptied because they should not be carried
            # over to the next article, where they may stay empty.
            $link = '';
            $headline = '';
            $main = '';
            $creator = '';
            $subject = '';
            $date = '';
            $dept = '';
            $section = '';
        } elsif ($insidemain) {
            # Still inside the body: either stop collecting or append the
            # line (with opening/closing lowercase i tags removed) to $main.
            # NOTE(review): as written the pattern below is two literal
            # newlines, which a chomp'ed input line cannot contain under
            # the default record separator, so $insidemain would never be
            # cleared here -- presumably the pattern originally held markup.
            if (m/

/) {
                $insidemain = 0;
            } else {
                s/<\/?i>//g;
                $main .= $_;
            }
        }
    }
}

# Fixed trailer: the "Search Slashdot" record (title, description, field
# name "query", target URL).
print "\n".
      "Search Slashdot\n".
      "Search Slashdot stories\n".
      "query\n".
      "http://slashdot.org/search.pl\n".
      "\n".
      "\n";