#!/usr/bin/python

# Licence: GPL 2 (c) Marc MERLIN

"""Extracts a blog from an archive directory of blogger posts, using a pattern
(perma string or perma unix time (old format)), and generates a cleaned up web 
page which removes traces to the real blog's URL
The primary use for this script is to give a link to a blog post to people who
you don't want to give you entire blog to

Usage:
  http://urlto/blogsnippet?perma=blogname|permatime

  A web page is generated on STDOUT (this is meant to be run as a CGI)
"""

__author__ = 'marc_soft@merlins.org'

import sys
import re
import glob
import os
import extract_blog_entry
import cgi
import cgitb

# Verbosity threshold handed to MakeVerbose(): a debug message is printed only
# when its level is <= this value, so 0 silences the default level-1 messages.
VERBOSE = 0
# Directory holding the archived blog posts.
BLOGDIR = "/home/merlin/blog" # CHANGEME
# Glob for the directories that contain the posts' .html files: a four-digit
# directory containing two-digit directories (presumably year/month -- confirm
# against your archive layout).
BLOG_ITEMS_PAT = BLOGDIR+"/[0-9][0-9][0-9][0-9]/[0-9][0-9]"

# My blog template emits links of the following form, and this script searches
# the archived posts for them:
# <A HREF="/cgi-bin/blogsnippet?perma=113199272368300001>
# Adjust this pattern to your template if your blogs also contain unix times.
# It is the text expected immediately before the unix time in a post.
BLOGUNIXTIMEPERMPAT = 'perma='	# CHANGEME

def die(msg=""):
  """Write msg (plus a newline) to sys.stderr and exit with status 255.

  Never returns: sys.exit() raises SystemExit.
  """
  # file.write() instead of the "print >>" statement: same output, but the
  # syntax is valid on both Python 2 and Python 3.
  sys.stderr.write("%s\n" % (msg,))
  sys.exit(255)

def Usage(code, msg=''):
  """Print msg followed by the module docstring, then exit with status code.

  Args:
    code: process exit status. A non-zero code sends the text to sys.stderr,
      zero sends it to sys.stdout.
    msg: optional message shown (followed by a blank line) before the usage
      text.

  Never returns: sys.exit() raises SystemExit.
  """
  fd = sys.stderr if code else sys.stdout
  # file.write() instead of the "print >>" statement: same output, but the
  # syntax is valid on both Python 2 and Python 3.
  fd.write(msg + "\n\n")
  fd.write("%s\n" % (__doc__,))
  sys.exit(code)

# We create a closure so that we can pass the Verbose function to our object
# (the idea being of course to pass the verbose level along)
def MakeVerbose(mast_level=1):
  """Return a Verbose(msg, level=1) function bound to the threshold mast_level.

  The returned function writes msg followed by "<BR>" and a newline to
  sys.stderr when level <= mast_level, and does nothing otherwise.
  """
  def Verbose(msg, level=1):
    if level > mast_level:
      return
    # sys.stderr is looked up at call time on purpose: main() rebinds it to
    # sys.stdout after this closure has been created.
    # file.write() instead of the "print >>" statement keeps the syntax valid
    # on both Python 2 and Python 3.
    sys.stderr.write(msg + "<BR>\n")
  return Verbose


# Characters that must never appear in a requested post name: path separators
# (directory traversal out of the blog archive), glob metacharacters (would let
# a visitor probe for posts they were not given a link to) and NUL.
_UNSAFE_NAME_RE = re.compile(r'[/\\*?\[\]\x00]')


def _html_escape(text):
  """Return text with &, <, > and double quotes replaced by HTML entities."""
  return (text.replace("&", "&amp;")
              .replace("<", "&lt;")
              .replace(">", "&gt;")
              .replace('"', "&quot;"))


def _find_post_by_unixtime(unixtime):
  """Return the first archived post containing the perma unix time, or None.

  unixtime must already be reduced to ASCII digits by the caller, since it is
  interpolated into a shell command.
  """
  # yes, yes, I'm forking a shell command, sue me :)
  # (much shorter than writing the equivalent in python)
  # -l makes grep print only the names of matching files.  Without it grep
  # omits the "file:" prefix when the glob expands to a single file, and the
  # matched line itself would be mistaken for a filename.
  pipe = os.popen("grep -l '%s%s[^0-9]' %s/*.html"
                  % (BLOGUNIXTIMEPERMPAT, unixtime, BLOG_ITEMS_PAT))
  try:
    matches = pipe.readlines()
  finally:
    pipe.close()
  if not matches:
    return None
  return matches[0].rstrip("\n")


# Takes the pattern from argv[1] (for debugging) or from the CGI "perma" field;
# the page is always written to stdout.
def main():
  """Look up one blog post and print it as a stand-alone HTML page on stdout.

  The pattern is either a string of digits (old unix-time perma, searched for
  inside the archived posts) or a post file name with an optional trailing
  ".html".  Any failure to find exactly one post ends the process via die().
  """
  # Will be shared between this module, and the extract_blog_entry class
  Verbose = MakeVerbose(VERBOSE)

  # Get ready for HTML output, even if text mode, in case perma was missing
  print("Content-Type: text/html; charset=UTF-8\n")

  # get pattern from command line for debugging, or from the http get/post
  try:
    pattern = sys.argv[1]
  # if no argv is received, we assume to be running as a CGI
  except IndexError:
    running_as_cgi = True
    # We want to see stderr in the browser too
    sys.stderr = sys.stdout
    # output tracebacks in an HTML friendly fashion
    cgitb.enable()
    pattern = cgi.FieldStorage().getfirst("perma")
  else:
    running_as_cgi = False

  if pattern is None:
    die("No patttern received")

  # Separate numerical blog perma patterns from filenames
  # note that matching blog posts by the old unix time perma date doesn't work
  # on new blogs unless your template contains old permas somewhere.
  # Adjust BLOGUNIXTIMEPERMPAT as necessary
  if re.search(r'^[0-9]+$', pattern):
    # '$' also matches just before a trailing newline, so strip anything that
    # is not a digit before the value reaches the shell command.
    pattern = re.sub(r'[^0-9]+', '', pattern)
    Verbose("Pattern %s is unix time" % pattern)

    filename = _find_post_by_unixtime(pattern)
    if filename is None:
      die("Couldn't find pattern " + pattern)
  else:
    # Remove an optional trailing .html in the pattern looked for
    pattern = re.sub(r'\.html$', '', pattern)
    # In CGI mode messages end up in the HTML page, so the visitor-supplied
    # pattern must be escaped before being echoed back.
    if running_as_cgi:
      shown = _html_escape(pattern)
    else:
      shown = pattern
    Verbose("Pattern %s is filename" % shown)

    # Refuse names that could escape the archive or act as wildcards; report
    # them exactly like an unknown post.
    if not pattern or _UNSAFE_NAME_RE.search(pattern):
      die("No blog match for " + shown)

    # even though it's unconditionally re-added here
    matches = glob.glob(BLOG_ITEMS_PAT + "/" + pattern + ".html")
    if not matches:
      die("No blog match for " + shown)
    elif len(matches) > 1:
      die("bug: %d matches for %s" % (len(matches), shown))
    filename = matches[0]

  Verbose("found blog in " + filename)

  # let the default handlers catch errors here
  with open(filename) as post:
    contents = post.read()

  # fix html that the sgml parser can't handle
  contents = re.compile("<br */>", re.IGNORECASE).sub("<br>", contents)

  html = extract_blog_entry.ScrapeHtml(pattern, Verbose, True, 10)
  html.feed(contents)
  # Single-argument print(...) produces the same output under the Python 2
  # print statement and the Python 3 print function.
  print('''
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
''')
  print("<title>%s</title>" % html.blog_title)
  print("\n".join(html.style_capture))
  print('''
</head>
<body>

<!-- Heading -->
<div id="header">
<h1>Blog Entry</h1>
</div>

<!-- Main Column -->
<div id="mainClm">

<!-- Blog Posts -->
''')
  print("\n".join(html.capture))
  print('''
</div>
</body>
</html>
''')

if __name__ == '__main__':
  main()
