#!/usr/bin/python2
# -*- coding: ISO-8859-1 -*-
# Licence: GPL 2 (c) Marc MERLIN
"""Class that inherits from SGMLParser to extract a blog entry from an HTML
archive page generated by blogger.

NOTE(review): this module was reviewed from a copy whose line breaks and
indentation had been collapsed, and in which several string literals that
should contain HTML markup were empty.  Block nesting below was reconstructed
from the statement order, and every literal that could not be recovered from
the surrounding code is left as found and flagged with NOTE(review).  Confirm
all of them against the upstream source before relying on this module.
"""

__author__ = 'marc_soft@merlins.org'

import re
import sgmllib

# Extend the regex from SGMLParser to support high bit characters like
# UTF-8 accents in unquoted attribute values.  The high-bit range is written
# with \x escapes so the pattern does not depend on how this source file is
# encoded.
# NOTE(review): the reviewed copy showed the range as two non-Latin-1
# characters, which cannot be right in a file declared ISO-8859-1; 0xA1-0xFF
# is the presumed intent -- confirm the lower bound.
sgmllib.attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9\xa1-\xff./,:;+*%?!&$_#=~\'"@]*))?')

# This could be made a configuration option if really needed.
# These markers are compared against tags as rebuilt by unknown_starttag /
# unknown_endtag, i.e. '<tag name="value">' and '</tag>'.
# NOTE(review): the two title markers and STYLE_CAPTURE_START hold no markup
# in the reviewed copy, so as written they can never match a rebuilt tag.
# Presumably they were a heading tag pair and a <style ...> start tag --
# TODO restore the exact values.
TITLE_ANCHOR_MARKER_START = "\n\n"
TITLE_ANCHOR_MARKER_END = "\n\n"
STYLE_CAPTURE_START = ''
# Was referenced by unknown_endtag but not defined anywhere in the reviewed
# copy; the rebuilt closing tag of a style element is always this string.
STYLE_CAPTURE_END = '</style>'


class ScrapeHtml(sgmllib.SGMLParser):
    """Take an HTML archive page from blogger and return parts of its content.

    Input:
      anchorname: the pattern we are looking for (blog name or unix time).
        It is stored on the instance but not otherwise used by this class.
      Verbose: callable used for logging.  This class calls it either as
        Verbose(message) or as Verbose(message, level).
      comments: when true, also capture the comments that people left in a
        blog.
      parser_verbose: passed through to sgmllib.SGMLParser.__init__.

    Output (public attributes, filled in while the page is fed to the parser):
      capture: list of the blog entry fragments (parsed for output).
      style_capture: list of fragments seen while style capture is on, in
        case the css stylesheet is needed to show the captured html.
      blog_title: the title found in the blog entry, "Untitled" by default.
    """

    def __init__(self, anchorname, Verbose, comments=False, parser_verbose=0):
        """Set up the capture state and initialise the underlying parser."""
        # PRIVATE VARS
        # The entire list of tokens that we parse (currently unused)
        self._tokens = []
        # keep track of whether we're currently capturing the blog content
        self._do_capture = False
        # keep track of whether we're currently capturing the style content
        self._stylecapture = False
        # keep track of a closing </a> we need to remove
        self._remove_next_closinga = False
        # keep track of the closing title tag that ends the title anchor
        self._close_next_h = False
        # find out whether we're inside a comment capture zone
        self._inside_comment = False
        # missing from the top class, although we don't actually really care
        # since we capture unknown entities too
        self.entitydefs['nbsp'] = ' '
        self.Verbose = Verbose
        # we'll add an A NAME to allow for bookmarks in the middle of the blog
        # NOTE(review): empty in the reviewed copy; presumably an
        # '<a name=...>' start tag -- TODO restore.
        self._aname = ''
        # currently unused
        self._anchorname = anchorname
        # do we want to capture other people's comments to your blog?
        self._capture_comments = comments

        # PUBLIC VARS
        # The actual blog content
        self.capture = []
        # The content between the style tags
        self.style_capture = []
        # We parse the blog data and capture the title
        self.blog_title = "Untitled"

        sgmllib.SGMLParser.__init__(self, parser_verbose)

    def add_data(self, data):
        """Record one reconstructed fragment of the page.

        All the parser callbacks below feed their (rebuilt) text through here
        so that the capture logic lives in a single place.  Whitespace-only
        fragments are dropped.  Every other fragment is appended to
        self._tokens, and then to self.capture while blog capture is on, or
        else to self.style_capture while style capture is on.
        """
        # skip effectively empty lines
        if re.search(r'^\s+$', data):
            return
        # store the whole html in there, in case we wanted it
        self._tokens.append(data)
        # if we are in the portion to extract:
        if self._do_capture:
            self.Verbose("Capturing %s (eat_a: %s|inside_comment: %s|capture: %s|"
                         "style capture: %s)" %
                         (data, self._remove_next_closinga,
                          self._inside_comment, self._do_capture,
                          self._stylecapture), 5)
            self.capture.append(data)
        elif self._stylecapture:
            self.Verbose("Capturing stylesheet: %s" % data, 5)
            self.style_capture.append(data)

    # overwrite the default class function as a callback (hence lowercase name)
    def handle_data(self, data):
        """Record plain text, remembering it as the title inside the markers."""
        self.add_data(data)
        # are we within the h markers for the blog's title?
        # Whitespace-only text is ignored here for the same reason add_data
        # drops it: it must not replace a title that was already found.
        if self._close_next_h and data.strip():
            self.blog_title = data

    # overwrite the default class function as a callback (hence lowercase name)
    def handle_comment(self, data):
        """Record an HTML comment and switch capture on or off on the markers.

        sgmllib hands over only the text between the comment delimiters, so
        the delimiters are put back before the comment is recorded and
        compared.

        NOTE(review): the three marker strings compared against below are
        empty in the reviewed copy, so none of these branches can currently
        fire and capture never starts.  From the code they are, in order: the
        comment that opens the readers' comments area, the comment that stops
        the capture, and the comment that starts the capture -- TODO restore
        the exact marker text.
        """
        data = "<!--%s-->" % data
        self.add_data(data)

        if data == "":
            self._inside_comment = True
            if not self._capture_comments:
                self._do_capture = False
                self.Verbose("stopped capture on %s (skip comments)" % data)

        # In case we were still capturing, stop now
        if data == "":
            if self._do_capture:
                self._do_capture = False
                self._inside_comment = False
                self.Verbose("stopped capture on %s (include comments)" % data)

        if data == "":
            self._do_capture = True
            self.Verbose("started capture on " + data)

    # overwrite the default class function as a callback (hence lowercase name)
    def unknown_starttag(self, tag, attrs):
        """Rebuild a start tag and record it, filtering or rewriting anchors.

        tag is the tag name and attrs the list of (name, value) pairs given
        by the parser.  The tag is rebuilt as '<tag name="value" ...>'.
        """
        attr_text = ''.join(' %s="%s"' % (name, value) for name, value in attrs)
        data = '<' + tag + attr_text + '>'
        self.Verbose("reconstructed " + data, 3)

        # if we are in the portion to extract:
        if self._do_capture:
            if data == TITLE_ANCHOR_MARKER_START:
                self.Verbose("Found %s within capture area, adding anchor %s" %
                             (data, self._aname), 1)
                self._close_next_h = True
                data += "\n" + self._aname
        # otherwise, look for extra tags we care about
        elif data == STYLE_CAPTURE_START:
            self.Verbose("Found %s within capture area, starting style capture"
                         % data, 1)
            self._stylecapture = True

        self.Verbose("Start tag %s (eat_a: %s|inside_comment: %s|capture: %s|"
                     "style capture: %s)" %
                     (data, self._remove_next_closinga, self._inside_comment,
                      self._do_capture, self._stylecapture), 5)

        # decide what to do with anchors, munge them, delete them or pass them
        if re.search(r'a .*href', data, re.IGNORECASE):
            # Deal with different kinds of a anchors: do not capture full URLs
            # that point back to the real blog or blogsnippet links (useless,
            # this is the current page)
            if re.search(r'a href.* title="permanent link"', data) or \
               re.search(r'a href.*cgi-bin/blogsnippet', data):
                self.Verbose("removed URL anchor: " + data, 2)
                self._remove_next_closinga = True
                self._tokens.append(data)
                return

            # kill any URLs within a comment area (too many would point
            # somewhere we don't want)
            if self._inside_comment:
                self.Verbose("removed URL anchor inside comment: " + data, 2)
                self._remove_next_closinga = True
                self._tokens.append(data)
                return

            # if we get an archives link, we can mangle it to a blogsnippet
            # link
            # NOTE(review): the replacement template is empty in the reviewed
            # copy, so the '%' below raises TypeError whenever an archives
            # link is seen -- TODO restore the rewritten '<a href=...>'.
            utime = re.search(r'a href="archives/.*#(\d+)', data, re.IGNORECASE)
            if utime:
                utime = utime.groups()
                data = '' % utime[0]
                self.Verbose("Changed archive a href to: |" + data + "|", 1)

            # mangle labels
            # NOTE(review): both the label pattern and its replacement are
            # empty in the reviewed copy.  As written the empty pattern
            # matches every anchor and has no groups, so label[0] raises
            # IndexError -- TODO restore both literals.
            label = re.search(r'', data, re.IGNORECASE)
            if label:
                label = label.groups()
                data = '' % label[0]
                self.Verbose("Changed label a href to: |" + data + "|", 1)

        # if we got that far, we have an approved a anchor, or another start
        # tag: process it like everything else
        self.add_data(data)

    # overwrite the default class function as a callback (hence lowercase name)
    def unknown_endtag(self, tag):
        """Rebuild a closing tag and record it, unless it must be dropped.

        The closing tag of an anchor whose start tag was removed by
        unknown_starttag is only stored in self._tokens, never captured.

        NOTE(review): the if/elif nesting below was reconstructed from a
        collapsed copy of this file -- confirm against upstream.
        """
        data = '</' + tag + '>'
        self.Verbose("Parsing closing tag %s" % data, 5)

        if self._close_next_h:
            if data == TITLE_ANCHOR_MARKER_END:
                self.Verbose("Found %s within capture area, adding closing anchor "
                             % data, 1)
                self._close_next_h = False
                # NOTE(review): presumably this also appended the closing tag
                # of the anchor opened from self._aname; only the newline is
                # left in the reviewed copy -- TODO restore.
                data += "\n"
        elif data == STYLE_CAPTURE_END:
            self.Verbose("Found %s within capture area, ending style capture"
                         % data, 1)
            self._stylecapture = False
            # add_data no longer stores into style_capture once the flag is
            # off, so the closing tag is appended by hand
            self.style_capture.append(data)
        elif data.lower() == "</a>" and self._remove_next_closinga:
            self.Verbose("removed end of URL anchor: " + data, 2)
            self._remove_next_closinga = False
            self._tokens.append(data)
            return
        else:
            self.Verbose("Got other unhandled closing tag %s" % data, 3)

        self.add_data(data)

    # overwrite the default class function as a callback (hence lowercase name)
    def unknown_entityref(self, ref):
        """Record an entity reference unknown to the parser, as '&ref;'."""
        self.add_data('&' + ref + ';')

    # overwrite the default class function as a callback (hence lowercase name)
    def unknown_charref(self, ref):
        """Record a character reference the parser did not handle, as '&#ref;'."""
        self.add_data('&#' + ref + ';')

    # overwrite the default class function as a callback (hence lowercase name)
    def unknown_decl(self, data):
        """Log a declaration the parser did not handle and record its text."""
        self.Verbose("unknown declaration " + data)
        self.add_data(data)

    # overwrite the default class function as a callback (hence lowercase name)
    def close(self):
        """Finish parsing; simply defers to SGMLParser.close."""
        sgmllib.SGMLParser.close(self)


if __name__ == '__main__':
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and also parses on Python 3.
    print("""
This is a python module that extracts a blog entry from an HTML page
It's not meant to be called from the command line""")