#!/usr/bin/env python
#
# pnu2rss by Pieter Edelman
# This script converts the Physics News Update (http://www.aip.org/pnu/) to an
# RSS feed. This update appears about once a week, so please choose your
# update interval wisely.
#
# ============================================================================
# WARNING: THIS IS A STAND-ALONE SCRIPT, NOT A FILTER!!
# The reason for this is that the PNU website only mentions when a new issue
# has appeared. For contents an extra link should be followed.
# It works with the PNU issues from 2004 onwards.
# ============================================================================
#
# Copyright (c) 2004 Pieter Edelman
# Released under the terms of the GNU General Public License (GPL) Version 2.
# See http://www.gnu.org/ for details.
#
# NOTE(review): the copy of this script that was reviewed had every HTML/XML
# tag and entity stripped out of its string literals (for example the
# "blockquote" pattern was an empty string and the RSS prints were empty).
# The markup below was reconstructed from variable names, comments and the
# surrounding logic. Every reconstructed literal is flagged with its own
# NOTE(review) and must be confirmed against a pristine copy of the script.

import re
import sys
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2

BASE_URL = "http://www.aip.org"
FRONT_PAGE = BASE_URL + "/pnu/"

# The matches used in this script:
# NOTE(review): "</blockquote>" is reconstructed; the name and the comment
# "append lines to the description until ... is found" suggest it.
blockquote = re.compile(r"</blockquote>")
whitespace = re.compile(r"^\s*")
eol = re.compile(r"\r\n")
# NOTE(review): "<p>" is reconstructed from the variable name only.
paragraph = re.compile(r"<p>")
tags = re.compile(r"<(.*?)>")
ampersand = re.compile(r"&")
# NOTE(review): the trailing "</p>" is reconstructed.
headline = re.compile(r'p class="headline">(.*)</p>')
# NOTE(review): the capture group was lost entirely. The code reads group 1
# and prefixes it with BASE_URL, so it must capture a site-relative path; the
# '<a href="...">' form and its position in the pattern are assumptions.
issue_link = re.compile(
    r'td bgcolor="#ffff99" colspan="2">Update #\d*\s*.*<a href="(.*?)"'
)
item_path = re.compile(r"(.*)/(.*)\.html")


def text_lines(response):
    """Yield the lines of an open URL *response* as text until EOF."""
    while True:
        raw = response.readline()
        if not raw:
            break
        if not isinstance(raw, str):
            # Python 3 hands back bytes. Latin-1 never fails to decode;
            # NOTE(review): the real page encoding is unknown -- confirm.
            raw = raw.decode("latin-1")
        yield raw


def find_issue_url(lines):
    """Return the absolute URL of the newest issue found in *lines*.

    Returns None when no line of the front page matches ``issue_link``.
    """
    for line in lines:
        match = issue_link.search(line)
        if match:
            return BASE_URL + match.group(1)
    return None


def item_link(issue_url, number):
    """Return the URL of item *number* (1-based) of the issue at *issue_url*.

    ".../NAME.html" becomes ".../split/NAME-<number>.html".
    """
    return item_path.sub(r"\1/split/\2-%d.html" % number, issue_url)


def escape_markup(text):
    """Escape ampersands and tags so *text* can sit inside an RSS element.

    Ampersands are escaped first so the entities produced for the tags are
    not escaped a second time.
    NOTE(review): the "&amp;", "&lt;" and "&gt;" replacements are
    reconstructed; the stripped copy replaced "&" with "&" and "<x>" with
    "<x>", which are no-ops.
    """
    text = ampersand.sub("&amp;", text)
    return tags.sub(r"&lt;\1&gt;", text)


def clean_description_line(line):
    """Return one body *line* prepared for the item description.

    Leading whitespace and the CR-LF line ending are removed, paragraph tags
    get blank lines in front of them, and the markup is then escaped.
    """
    line = whitespace.sub("", line)
    line = eol.sub("", line)
    # NOTE(review): the replacement text is partly reconstructed ("<p>").
    line = paragraph.sub(" \n\n<p>", line)
    return escape_markup(line)


def parse_items(lines, issue_url):
    """Yield ``(title, link, description)`` for every headline in *lines*.

    *lines* is an iterable of text lines of an issue page and *issue_url* is
    the address the page was fetched from (used to build the item links).
    The title is passed through exactly as it appears in the page.
    """
    lines = iter(lines)
    count = 0
    for line in lines:
        match = headline.search(line)
        if not match:
            continue
        count += 1
        # Skip over the line following the headline (the <blockquote>
        # statement, according to the original comment).
        next(lines, None)
        # Collect the body up to the closing tag or EOF. The original loop
        # tested the *cleaned* line for truthiness, so a blank line inside
        # the body ended the description early; iterating avoids that.
        parts = []
        for body_line in lines:
            if blockquote.search(body_line):
                break
            parts.append(clean_description_line(body_line))
        yield match.group(1), item_link(issue_url, count), "".join(parts)


def write_feed(items, out):
    """Write the RSS document for *items* to the file-like object *out*.

    *items* is an iterable of ``(title, link, description)`` tuples.
    NOTE(review): all element names and the RSS version are reconstructed;
    the stripped copy printed four header lines whose content was lost, and
    only the channel title, link, description and image URL texts survived.
    """
    emit = out.write
    # Header
    emit('<?xml version="1.0"?>\n')
    emit('<rss version="2.0">\n')
    emit("<channel>\n")
    emit("  <title>Physics News Update</title>\n")
    emit("  <link>http://www.aip.org/pnu/</link>\n")
    emit("  <description>The AIP Bulletin of Physics News</description>\n")
    emit("  <image>\n")
    emit("    <url>http://www.aip.org/pnu/images/pnulogo.gif</url>\n")
    emit("  </image>\n")
    # Items
    for title, link, description in items:
        emit("  <item>\n")
        emit("    <title>" + title + "</title>\n")
        emit("    <link>" + link + "</link>\n")
        emit("    <description>" + description + "\n")
        emit("    </description>\n")
        emit("  </item>\n")
    # Footer
    emit("</channel>\n")
    emit("</rss>\n")


def main():
    """Fetch the newest issue and print it as RSS; return the exit status.

    Nothing is written to stdout unless both pages were fetched, so a
    failure never leaves a truncated feed behind. Errors are reported on
    stderr and signalled with exit status 1 (the original exited silently
    with status 0 after having printed the feed header).
    """
    try:
        # First, get the link to the newest issue from the front page.
        front = urlopen(FRONT_PAGE)
        try:
            issue_url = find_issue_url(text_lines(front))
        finally:
            front.close()
        if issue_url is None:
            sys.stderr.write("pnu2rss: no issue link found on the front page\n")
            return 1
        # Then fetch the issue page and pull out its items.
        page = urlopen(issue_url)
        try:
            items = list(parse_items(text_lines(page), issue_url))
        finally:
            page.close()
    except IOError as err:
        sys.stderr.write("pnu2rss: %s\n" % err)
        return 1
    write_feed(items, sys.stdout)
    return 0


if __name__ == "__main__":
    sys.exit(main())