%s

#!/usr/bin/python
# Licence: GPL 2 (c) Marc MERLIN

"""Extracts a blog from an archive directory of blogger posts, using a pattern
(perma string or perma unix time (old format)), and generates a cleaned up web
page which removes traces to the real blog's URL

The primary use for this script is to give a link to a blog post to people who
you don't want to give your entire blog to

Usage: http://urlto/blogsnippet?perma=blogname|permatime

A web page is generated on STDOUT (this is meant to be run as a CGI)
"""

__author__ = 'marc_soft@merlins.org'

import sys
import re
import glob
import os
import extract_blog_entry
import cgi
import cgitb

VERBOSE = 0
BLOGDIR = "/home/merlin/blog" # CHANGEME
BLOG_ITEMS_PAT = BLOGDIR+"/[0-9][0-9][0-9][0-9]/[0-9][0-9]"

# Grep pattern that precedes the old unix-time perma id in the blog template.
# NOTE(review): the original value (and the comment describing the template
# markup it matches) contained HTML and was stripped when this file was
# converted to text -- restore it from the original script. With an empty
# prefix the lookup below matches the bare number anywhere in a post.
BLOGUNIXTIMEPERMPAT = ""  # CHANGEME

# Characters never accepted in a perma pattern: path separators (directory
# traversal out of BLOGDIR), glob metacharacters (enumerating posts the caller
# was not given a link to) and HTML metacharacters (the pattern is echoed back
# in error messages served as text/html).
_UNSAFE_PERMA_RE = re.compile(r'[/\\*?\[\]<>&"\'\x00]')


# NOTE(review): only the tail of this function survived the text conversion
# ("... >> sys.stderr, msg" / "sys.exit(255)"); the header is reconstructed
# from the call sites in main().
def die(msg):
    """Write msg to stderr and terminate the script with exit status 255."""
    sys.stderr.write(msg + "\n")
    sys.exit(255)


def Usage(code, msg=''):
    """Print msg followed by the module docstring, then exit with code.

    Output goes to stderr when code is non-zero, to stdout otherwise.
    """
    if code:
        fd = sys.stderr
    else:
        fd = sys.stdout
    fd.write(msg + "\n\n")
    fd.write("%s\n" % __doc__)
    sys.exit(code)


# We create a closure so that we can pass the Verbose function to our object
# (the idea being of course to pass the verbose level along)
def MakeVerbose(mast_level=1):
    """Return a Verbose(msg, level=1) function bound to mast_level.

    The returned function writes msg to stderr when level <= mast_level.
    """
    def Verbose(msg, level=1):
        if level <= mast_level:
            # sys.stderr is looked up at call time on purpose: main() points
            # it at stdout when running as a CGI.
            # NOTE(review): the suffix literal was lost in the text conversion
            # (rendered as a line break); "<br>" is presumed -- confirm.
            sys.stderr.write(msg + "<br>" + "\n")
    return Verbose


# takes pattern [outputfile] (default to stdout)
def main():
    """Locate the blog post matching the perma pattern and print it as HTML.

    The pattern comes from argv[1] (debugging) or from the "perma" CGI field.
    An all-digit pattern is treated as an old unix-time perma id and looked up
    with grep; anything else is treated as a post file name and looked up with
    glob under BLOG_ITEMS_PAT. Exits through die() when nothing matches.
    """
    # Will be shared between this module, and the extract_blog_entry class
    Verbose = MakeVerbose(VERBOSE)

    # Get ready for HTML output, even if text mode, in case perma was missing
    sys.stdout.write("Content-Type: text/html; charset=UTF-8\n\n")

    # get pattern from command line for debugging, or from the http get/post
    try:
        pattern = sys.argv[1]
    # if no argv is received, we assume to be running as a CGI
    except IndexError:
        # We want to see stderr in the browser too
        sys.stderr = sys.stdout
        # output tracebacks in an HTML friendly fashion
        cgitb.enable()
        form = cgi.FieldStorage()
        pattern = form.getfirst("perma")

    if pattern is None:
        die("No patttern received")

    # The pattern is untrusted: refuse anything that could escape the blog
    # directory, act as a wildcard, or inject markup into our own output.
    # The offending value is deliberately not echoed back.
    if _UNSAFE_PERMA_RE.search(pattern):
        die("Invalid characters in pattern")

    # Separate numerical blog perma patterns from filenames
    # note that matching blog posts by the old unix time perma date doesn't work
    # on new blogs unless your template contains old permas somewhere.
    # Adjust BLOGUNIXTIMEPERMPAT as necessary
    if re.search(r'^\d+$', pattern):
        # clean up pattern, just in case ('$' also matches before a trailing
        # newline, which this removes before the value reaches the shell)
        pattern = re.sub("([^a-zA-Z0-9]*)", "", pattern)
        Verbose("Pattern %s is unix time" % pattern)
        # yes, yes, I'm forking a shell command, sue me :)
        # (much shorter than writing the equivalent in python)
        # -l makes grep print only file names, so the result is correct even
        # when the shell glob expands to a single file (no "file:" prefix) or
        # when a path contains a colon.
        pipe = os.popen("grep -l '%s%s[^0-9]' %s/*.html" %
                        (BLOGUNIXTIMEPERMPAT, pattern, BLOG_ITEMS_PAT))
        try:
            matches = pipe.readlines()
        finally:
            pipe.close()
        if not matches:
            die("Couldn't find pattern " + pattern)
        filename = matches[0].rstrip("\n")
    else:
        # Remove an optional trailing .html in the pattern looked for
        pattern = re.sub(r'\.html$', '', pattern)
        Verbose("Pattern %s is filename" % pattern)
        # even though it's unconditionally re-added here
        matches = glob.glob(BLOG_ITEMS_PAT + "/" + pattern + ".html")
        if not matches:
            die("No blog match for "+pattern)
        elif len(matches) > 1:
            die("bug: %d matches for %s" % (len(matches), pattern))
        filename = matches[0]

    Verbose("found blog in " + filename)

    # let the default handlers catch errors here
    with open(filename) as blog_file:
        page_source = blog_file.read()

    # fix html that the sgml parser can't handle
    # NOTE(review): both the pattern and the replacement were lost in the text
    # conversion (each rendered as a line break); normalising <br/> variants
    # to <br> is a guess -- restore the literals from the original script.
    page_source = re.compile(r"<br\s*/?>", re.IGNORECASE).sub("<br>",
                                                              page_source)

    html = extract_blog_entry.ScrapeHtml(pattern, Verbose, True, 10)
    html.feed(page_source)

    # NOTE(review): the markup in the literals below was stripped by the text
    # conversion; this is a minimal reconstruction of the page wrapper, not
    # the original template -- restore it from the original script.
    emit = sys.stdout.write
    emit("<html>\n<head>\n")
    emit("<title>%s</title>\n" % html.blog_title)
    emit("\n".join(html.style_capture) + "\n")
    emit("</head>\n<body>\n")
    emit("\n".join(html.capture) + "\n")
    emit("</body>\n</html>\n")


if __name__ == '__main__':
    main()

Blog Entry