#!/usr/bin/python2
# -*- coding: ISO-8859-1 -*-

# Licence: GPL 2 (c) Marc MERLIN

"""Class that inherits from SGMLParser to extract a blog entry from an HTML
archive page generated by blogger 
"""

__author__ = 'marc_soft@merlins.org'

import re
import sgmllib

# Extend the attribute-matching regex from sgmllib so that unquoted attribute
# values may contain high bit characters (e.g. accented letters).
# This rebinds the module-level sgmllib.attrfind, so the change is not limited
# to the parser class defined in this file.
# NOTE(review): the character range in the last alternative is written with
# literal non-ASCII characters, so the bytes it matches depend on this file's
# coding declaration (ISO-8859-1) -- confirm that it covers UTF-8 encoded
# input as intended.
sgmllib.attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9¡-ÿ./,:;+*%?!&$\(\)_#=~\'"@]*))?')

# Markers recognised while parsing.  They are plain module constants for now;
# they could become configuration options if that is ever really needed.
# Opening/closing heading tags around the entry title.
TITLE_ANCHOR_MARKER_START, TITLE_ANCHOR_MARKER_END = "<h2>", "</h2>"
# Opening/closing tags around the stylesheet to capture.
STYLE_CAPTURE_START, STYLE_CAPTURE_END = '<style type="text/css">', '</style>'

class ScrapeHtml(sgmllib.SGMLParser):
  """Takes an HTML archive page from blogger, and returns parts of its content.

  Every token seen by the parser is rebuilt as text and routed through
  add_data(), which stores it in `capture` while the blog entry is being
  captured, or in `style_capture` while a stylesheet is being captured.
  Capture of the entry is driven by HTML comments found in the page: it starts
  after "<!-- Blog Posts -->" and stops at "<!-- /ItemPage -->" (or already at
  "<!-- ItemPage -->" when comments=False).

  Input:
    anchorname: name used for the <a name="..."> anchor that is inserted right
      after the title heading (blog name or unix time)
    Verbose: logging callable.  It is called as Verbose(message, level) and, in
      a few places, as Verbose(message), so the level has to be optional
    comments: True will also capture the comments that people left in a blog
    parser_verbose: passed unchanged to sgmllib.SGMLParser.__init__

  Output (public attributes):
    capture: list of text chunks making up the blog entry (parsed for output)
    style_capture: list of text chunks making up the css stylesheet, if needed
      to show the captured html
    blog_title: the title given in the blog entry (text found inside the title
      heading), "Untitled" if none was found
  """

  # Matches chunks that only consist of whitespace; those are never stored.
  _BLANK_RE = re.compile(r'^\s+$')

  def __init__(self, anchorname, Verbose, comments=False, parser_verbose=0):
    """Set up the capture state and initialise the underlying SGMLParser."""
    # PRIVATE VARS
    # The entire list of tokens that we parse (currently unused)
    self._tokens = []
    # keep track of whether we're currently capturing the content of the blog
    self._do_capture = False
    # keep track of whether we're currently capturing the content of style
    self._stylecapture = False
    # keep track of a closing </a> we need to remove
    self._remove_next_closinga = False
    # True while we are between the title heading's opening and closing tag
    self._close_next_h = False
    # find out whether we're inside a comment capture zone
    self._inside_comment = False
    # text chunks seen so far inside the current title heading
    self._title_parts = []
    # nbsp is missing from the top class, although we don't actually really
    # care since we capture unknown entities too.  Work on a per-instance copy:
    # entitydefs is a class attribute, writing into it directly would change
    # the table for every other SGMLParser in the process.
    self.entitydefs = dict(self.entitydefs)
    self.entitydefs['nbsp'] = ' '

    self.Verbose = Verbose
    # we'll add an A NAME to allow for bookmarks in the middle of the blog
    self._aname = '<a name="' + anchorname + '">'
    # currently unused
    self._anchorname = anchorname
    # do we want to capture other people's comments to your blog?
    self._capture_comments = comments

    # PUBLIC VARS
    # The actual blog content
    self.capture = []
    # The content between the style tags
    self.style_capture = []
    # We parse the blog data and capture the title
    self.blog_title = "Untitled"
    sgmllib.SGMLParser.__init__(self, parser_verbose)

  def add_data(self, data):
    """Store one reconstructed chunk of the page.

    All parser callbacks feed their text through here.  Whitespace-only chunks
    are dropped.  Anything else is appended to the token list and then to
    `capture` (when the blog entry is being captured) or otherwise to
    `style_capture` (when a stylesheet is being captured).
    """
    # skip effectively empty lines
    if self._BLANK_RE.search(data):
      return

    # store the whole html in there, in case we wanted it
    self._tokens.append(data)

    # if we are in the portion to extract:
    if self._do_capture:
      self.Verbose("Capturing %s (eat_a: %s|inside_comment: %s|capture: %s|"
                   "style capture: %s)" %
                   (data, self._remove_next_closinga, self._inside_comment,
                    self._do_capture, self._stylecapture), 5)
      self.capture.append(data)
    elif self._stylecapture:
      self.Verbose("Capturing stylesheet: %s" % data, 5)
      self.style_capture.append(data)

  def _add_title_text(self, text):
    """Append text found inside the title heading and refresh blog_title.

    The parser may deliver the heading's text in several pieces (for instance
    around an entity), so the pieces are joined instead of keeping only the
    last one.  Whitespace-only text never replaces an existing title.
    """
    self._title_parts.append(text)
    title = ''.join(self._title_parts)
    if title.strip():
      self.blog_title = title

  # overwrite the default class function as a callback (hence lowercase name)
  def handle_data(self, data):
    """Record plain text; inside the title heading it also feeds blog_title."""
    self.add_data(data)
    # are we within the h markers for the blog's title?
    if self._close_next_h:
      self._add_title_text(data)

  # overwrite the default class function as a callback (hence lowercase name)
  def handle_comment(self, data):
    """Record an HTML comment and switch capture on/off on marker comments.

    The comment is stored before the state changes, so the marker that stops a
    capture is still part of it while the marker that starts one is not.
    """
    data = "<!--%s-->" % data
    self.add_data(data)

    if data == "<!-- ItemPage -->":
      self._inside_comment = True
      if not self._capture_comments:
        self._do_capture = False
        self.Verbose("stopped capture on %s (skip comments)" % data)

    # In case we were still capturing, stop now
    if data == "<!-- /ItemPage -->":
      if self._do_capture:
        self._do_capture = False
        self._inside_comment = False
        self.Verbose("stopped capture on %s (include comments)" % data)

    if data == "<!-- Blog Posts -->":
      self._do_capture = True
      self.Verbose("started capture on " + data)

  # overwrite the default class function as a callback (hence lowercase name)
  def unknown_starttag(self, tag, attrs):
    """Rebuild an opening tag, then drop it, rewrite it or record it.

    - the title heading gets our <a name="..."> anchor appended (only while
      capturing the blog entry)
    - the stylesheet opening tag starts the style capture (only while not
      capturing the blog entry)
    - permanent-link and blogsnippet anchors, and any link anchor inside the
      comment zone, are dropped together with their next closing </a>
    - archive and label links are rewritten to local URLs
    """
    # tag is rebuilt as <tag name="value" ...>
    data = '<' + tag
    for name, value in attrs:
      data += " " + name + '=' + '"' + value + '"'
    data += '>'
    self.Verbose("reconstructed " + data, 3)

    # if we are in the portion to extract:
    if self._do_capture:
      if data == TITLE_ANCHOR_MARKER_START:
        self.Verbose("Found %s within capture area, adding anchor %s" %
                     (data, self._aname), 1)
        self._close_next_h = True
        # a new heading starts a new title
        self._title_parts = []
        data += "\n" + self._aname
    # otherwise, look for extra tags we care about
    elif data == STYLE_CAPTURE_START:
      self.Verbose("Found %s within capture area, starting style capture" %
                   data, 1)
      self._stylecapture = True

    self.Verbose("Start tag %s (eat_a: %s|inside_comment: %s|capture: %s|"
                 "style capture: %s)" %
                 (data, self._remove_next_closinga, self._inside_comment,
                  self._do_capture, self._stylecapture), 5)

    # decide what to do with anchors, munge them, delete them or pass them
    if re.search(r'a .*href', data, re.IGNORECASE):
      # Deal with different kinds of a anchors do not capture full URLs that
      # point back to the real blog or blogsnippet links (useless, this is the
      # current page)
      if (re.search(r'a href.* title="permanent link"', data) or
          re.search(r'a href.*cgi-bin/blogsnippet', data)):
        self.Verbose("removed URL anchor: " + data, 2)
        self._remove_next_closinga = True
        self._tokens.append(data)
        return

      # kill any URLs within a comment area (too many would point somewhere we
      # don't want)
      if self._inside_comment:
        self.Verbose("removed URL anchor inside comment: " + data, 2)
        self._remove_next_closinga = True
        self._tokens.append(data)
        return

      # if we get an archives link, we can mangle it to a blogsnippet link
      utime = re.search(r'a href="archives/.*#(\d+)', data, re.IGNORECASE)
      if utime:
        data = '<a href="/cgi-bin/blogsnippet?perma=%s">' % utime.group(1)
        self.Verbose("Changed archive a href to: |" + data + "|", 1)

      # mangle labels
      label = re.search(r'<a.*href=".*/labels/(.+).html">', data, re.IGNORECASE)
      if label:
        data = '<a href="/perso/%s/">' % label.group(1)
        self.Verbose("Changed label a href to: |" + data + "|", 1)

    # if we got that far, we have an approved a anchor, or another start tag
    # process it like everything else
    self.add_data(data)

  # overwrite the default class function as a callback (hence lowercase name)
  def unknown_endtag(self, tag):
    """Rebuild a closing tag, then drop it or record it.

    The end of the title heading gets the closing </a> of our anchor appended,
    the end of the stylesheet stops the style capture, and the </a> that
    belongs to a previously dropped anchor is dropped as well.
    """
    data = '</' + tag + '>'

    self.Verbose("Parsing closing tag %s" % data, 5)
    # The title test is a condition of its own: other closing tags met inside
    # the heading must still go through the branches below, otherwise a pending
    # </a> removal would be skipped and hit a later, unrelated </a> instead.
    if self._close_next_h and data == TITLE_ANCHOR_MARKER_END:
      self.Verbose("Found %s within capture area, adding closing anchor </a>"
                   % data, 1)
      self._close_next_h = False
      data += "\n</a>"
    elif data == STYLE_CAPTURE_END:
      self.Verbose("Found %s within capture area, ending style capture" %
                   data, 1)
      self._stylecapture = False
      # stored here because add_data() no longer sees the style capture as on
      self.style_capture.append(data)
    elif data.lower() == "</a>" and self._remove_next_closinga:
      self.Verbose("removed end of URL anchor: " + data, 2)
      self._remove_next_closinga = False
      self._tokens.append(data)
      return
    else:
      self.Verbose("Got other unhandled closing tag %s" % data, 3)
    self.add_data(data)

  # overwrite the default class function as a callback (hence lowercase name)
  def unknown_entityref(self, ref):
    """Record an entity reference that has no replacement, as &ref;."""
    data = '&' + ref + ';'
    self.add_data(data)
    # keep the reference in the title rather than losing that character
    if self._close_next_h:
      self._add_title_text(data)

  # overwrite the default class function as a callback (hence lowercase name)
  def unknown_charref(self, ref):
    """Record a character reference that has no replacement, as &#ref;."""
    data = '&#' + ref + ';'
    self.add_data(data)
    # keep the reference in the title rather than losing that character
    if self._close_next_h:
      self._add_title_text(data)

  # overwrite the default class function as a callback (hence lowercase name)
  def unknown_decl(self, data):
    """Log an unknown declaration and record its content."""
    self.Verbose("unknown declaration " + data)
    self.add_data(data)

  # overwrite the default class function as a callback (hence lowercase name)
  def close(self):
    """Finish parsing; simply delegates to SGMLParser.close()."""
    sgmllib.SGMLParser.close(self)


if __name__ == '__main__':
  # Importing is the only supported use; just say so when run as a script.
  _usage = """
This is a python module that extracts a blog entry from an HTML page
It's not meant to be called from the command line"""
  print(_usage)
