UN Secretary-General Twitter Source Code

Here is the source code that generates http://twitter.com/secgen; more information.

#!/usr/bin/env python
#
# secgen.py:
# Twitter the daily schedule of the UN Secretary-General
#
# Copyright (c) 2007 Matthew Somerville. All rights reserved.
# http://www.dracos.co.uk/

import optparse
import os
import re
import sys
import textwrap
import urllib
import htmlentitydefs
from datetime import datetime, timedelta
from time import strptime, sleep
from BeautifulSoup import BeautifulSoup
import tweepy
from secgenconfig import *

# Path of the locally cached copy of the schedule page. fetch() writes it;
# parse() reads it, preferring a sibling file named '<localfile>-override'
# when that file can be opened.
localfile = '/guest/matthew/data/secgen-schedule'

def main():
    """Parse the command line and run the requested action.

    --action fetch    download the schedule; if it changed, print the parse
    --action twitter  tweet every event whose time began within the last
                      five minutes
    --action test     print the parsed schedule
    Anything else prints the usage help.
    """
    actions = ['fetch', 'twitter', 'test']
    parser = optparse.OptionParser(version="UN Secretary-General > Twitter v1.0")
    parser.add_option(
        '--action', type='choice', choices=actions,
        help='Action to perform; one of %s' % ', '.join(actions))
    options, _ = parser.parse_args()
    action = options.action

    if action == 'fetch':
        # Only show the parsed result when a new schedule was stored.
        if fetch():
            test()
        return

    if action == 'test':
        test()
        return

    if action != 'twitter':
        parser.print_help()
        return

    now = datetime.today()
    window = timedelta(minutes=5)
    for when, event in parse():
        # Tweet events whose start time falls in the five minutes up to now.
        if when <= now < when + window:
            twitter(event)
def test():
    """Print every parsed (time, event) pair, with parser warnings enabled."""
    # Disabled stdout re-encoding kept from the original for reference:
    #import codecs
    #sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    for when, what in parse(warn=1):
        print("%s %s" % (when, what))
def fetch():
    """Download the schedule page and cache it locally if it has changed.

    The page is stored in `localfile` only when it differs from the cached
    copy and does not look like an error page. Storing a new page also
    removes any '<localfile>-override' file.

    Returns True when a new schedule was written, False otherwise.
    """
    new = get_contents('http://www.un.org/sg/schedule.shtml')

    try:
        current = get_contents(localfile)
    except IOError:
        # No readable cached copy yet: treat it as empty.
        current = ''

    if current == new:
        return False
    # The inline flag must lead the pattern; a trailing (?i) is rejected by
    # newer versions of the re module.
    if re.search('(?i)Not Found|Service Temporarily Unavailable', new):
        return False

    f = open(localfile, 'w')
    try:
        f.write(new)
    finally:
        f.close()

    try:
        os.remove('%s-override' % localfile)
    except OSError:
        # Best effort: there is usually no override file to remove.
        pass

    print("New schedule downloaded")
    return True
# Date-heading layouts tried, in order, by _parse_heading_date().
_DATE_FORMATS = ('%A, %d %B %Y', '%d %B %Y', '%A %d %B %Y')


def _clean_heading(text):
    """Strip boilerplate from a heading cell; return '' if it is not a date."""
    text = re.sub('\xc2\xa0', ' ', text)
    text = re.sub('APPOINTMENTS OF THE SECRETARY-GENERAL', '', text)
    text = re.sub(r'(?i)\[?All (other )?appointments are internal\.?\]?', '', text)
    text = re.sub(r'\(scroll down for Appointments.*', '', text)
    text = re.sub(r'(?i) \(Subject to change\)| *Subject to Change| Please note that this schedule is subject to change|( |\*)?REV\.? ?[12]\*?', '', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'^\s+', '', text)
    if re.search('(?i)Back to Spokes|on an official trip|on official travel|is travelling$|will be visiting|on official visit', text):
        return ''
    return text


def _parse_heading_date(text):
    """Return text parsed as a struct_time, or None if no known layout fits."""
    for fmt in _DATE_FORMATS:
        try:
            return strptime(text, fmt)
        except ValueError:
            continue
    return None


def parse(warn=0):
    """Parse the cached schedule page into a list of (datetime, event) pairs.

    Reads '<localfile>-override' if it can be opened, otherwise `localfile`.
    Table rows are either date headings, which set the date for the rows
    that follow, or time/event rows.

    warn -- when true, print diagnostics before giving up.

    Returns a list of (time, event) tuples; an empty list if the cached page
    is a proxy error. Calls sys.exit() if a date heading cannot be parsed or
    an event appears before any date heading.
    """
    try:
        d = get_contents("%s-override" % localfile)
    except IOError:
        d = get_contents(localfile)

    if re.search('Proxy Error', d):
        if warn:
            print('Have downloaded a proxy error...')
        return []

    soup = BeautifulSoup(d, smartQuotesTo=None)
    table = soup('body')[1].find('table')

    events = []
    date = None
    # NOTE(review): pastnoon is never reset when a new date heading is seen;
    # confirm whether a page can list more than one day.
    pastnoon = False
    for row in table('tr'):
        cells = row('td')
        if not cells:
            # Rows without <td> cells carry neither a date nor an event.
            continue
        time = parsecell(cells[0])
        if not time and len(cells) < 3:
            continue

        # Work out whether this row is a heading rather than a time/event row.
        heading = None
        if len(cells) == 3 and not time:
            heading = parsecell(cells[2])
        elif len(cells) == 2:
            if not parsecell(cells[1]):
                # Two cell column, second empty, can't be time/event
                heading = time
        elif len(cells) == 1:
            heading = time

        if heading:
            heading = _clean_heading(heading)
            if heading == '':
                continue
            parsed = _parse_heading_date(heading)
            if parsed is None:
                if warn:
                    print("AARGH - *%s*" % heading)
                sys.exit()  # Bomb out if we can't get a date
            date = parsed
            continue

        if time == '' or time == '.':
            continue
        if date is None:
            # Previously an unbound-name crash; bomb out the same way an
            # unparseable date does.
            if warn:
                print("AARGH - event before any date: *%s*" % time)
            sys.exit()

        time, pastnoon = parsetime(time, date, pastnoon)
        if len(cells) == 2:
            event = parsecell(cells[1], True)
        else:
            event = parsecell(cells[2], True)
        events.append((time, prettify(event)))
    return events
def twitter(s):
    """Post s as one or more status updates and return the joined responses.

    Text longer than 140 characters is wrapped at 137 so that a "..."
    continuation marker fits: the first part gets a trailing marker, every
    later part a leading one. There is a five second pause between posts.
    """
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)

    if len(s) > 140:
        parts = textwrap.wrap(s, 137)
    else:
        parts = [s]
    multipart = len(parts) > 1

    posted = ''
    for index, part in enumerate(parts):
        if posted:
            sleep(5)
        if index == 0:
            if multipart:
                part = "%s..." % part
        else:
            part = "...%s" % part
        posted += api.update_status(part).text
    return posted
def parsetime(time, date, pastnoon):
    """Turn a schedule time string into a datetime on the given date.

    time     -- text such as '9.30 a.m.', '3 p.m.', '4:15', '12 noon', 'noon'
    date     -- struct_time supplying year, month and day
    pastnoon -- True once an earlier entry was at or after noon; times with
                no a.m./p.m. marker are then taken as afternoon

    Returns (datetime, pastnoon), where pastnoon is updated for the next
    entry. Raises ValueError if `time` is not recognised or yields an
    out-of-range hour or minute.
    """
    m = re.search(r'(\d+)(?:(?::|\.)\s*(\d+))?(?:\s+(a|p|noon))?', time)
    if m:
        hour, minute, pm = m.groups()
        if minute is None:
            minute = 0
    elif time == 'noon':
        hour, minute, pm = 12, 0, 'noon'
    else:
        # Previously fell through to an unbound-variable error.
        raise ValueError("Unrecognised time: %r" % (time,))

    hour = int(hour)
    minute = int(minute)
    # Unmarked afternoon times move to the 24-hour clock; hours already at
    # 12 or above stay put, otherwise "12.30" would become hour 24.
    if not pm and pastnoon and hour < 12:
        hour += 12
    if pm == 'p' and hour != 12:
        hour += 12
    if pm == 'a' and hour == 12:
        hour -= 12
    if pm == 'p' or pm == 'noon':
        pastnoon = True

    d = datetime(date.tm_year, date.tm_mon, date.tm_mday, hour, minute)
    d += timedelta(hours=5) # Assume we're in New York, and BST is same (which it isn't)
    return d, pastnoon
# Ordered rewrite rules for prettify(): (finder, trigger, target, replacement).
# The first rule whose trigger is found wins; `target` is then substituted by
# `replacement` throughout the string. Trigger and target differ for a few rules.
_PRETTIFY_REWRITES = (
    (re.match,
     r'Joint press encounter by the Secretary-General with: ',
     r'Joint press encounter by the Secretary-General with: ',
     'Joint press encounter with '),
    (re.match,
     r'Joint Declaration on (.*?) by the Secretary-General and ',
     r'Joint (.*?) by the Secretary-General and ',
     r'Joint \1 with '),
    (re.match,
     r'Secretary-General[^a-zA-Z]*to address ',
     r'Secretary-General[^a-zA-Z]*to address ',
     'Addressing '),
    (re.match,
     r'Secretary-General to make ',
     r'Secretary-General to make ',
     'Making '),
    (re.match,
     r'Secretary-General to attend ',
     r'Secretary-General to attend ',
     'Attending '),
    (re.match,
     r'Secretary-General to brief ',
     r'Secretary-General to brief ',
     'Briefing '),
    # NOTE(review): parse() passes text that has already been through
    # unescape(), so a literal '&rsquo;' may never reach here; confirm.
    (re.match,
     r'Secretary-General&rsquo;s briefing to ',
     r'Secretary-General&rsquo;s briefing to ',
     'Briefing to '),
    (re.match,
     r'Secretary-General to speak at ',
     r'Secretary-General to speak at ',
     'Speaking at '),
    (re.match,
     r'Secretary-General to speak to ',
     r'Secretary-General to speak to ',
     'Speaking to '),
    (re.match,
     r"Secretary-General's opening statement at ",
     r"Secretary-General's opening statement at his ",
     'Making opening statement at my '),
    (re.match,
     r"Secretary-General's closing statement at ",
     r"Secretary-General's closing statement at his ",
     'Making closing statement at my '),
    (re.match,
     r'Secretary-General to deliver ',
     r'Secretary-General to deliver ',
     'Delivering '),
    (re.match,
     r'Remarks by the Secretary-General |SG remarks at|Secretary-General (to give )?remarks at',
     r'Remarks by the Secretary-General |SG remarks |Secretary-General (to give )?remarks ',
     'Making remarks '),
    # '.\200\223' matches the three bytes of a UTF-8 en dash.
    (re.search,
     ' .\200\223 Secretary-General to make remarks$',
     '^(.*) .\200\223 Secretary-General to make remarks',
     r'Making remarks at the \1'),
    (re.match,
     r'\[Remarks at\] ',
     r'\[Remarks at\] ',
     'Making remarks at '),
)

# Descriptions of groups of people, reported as "Meeting the ...".
_PRETTIFY_GROUPS = (
    r'^Chairmen|^Permanent Representative|^Executive Secretaries'
    r'|Board members|Permanent Representatives|Envoys|Team$'
    r'|^Honou?rable|Interns'
)

# Leading titles and names of individuals, reported as "Meeting ...".
_PRETTIFY_TITLES = (
    r'President|Association of|Vuk|Prince|Major-General|His Excellency'
    r'|His Eminence|His Holiness|His Majesty|Ambassador|HE|H\.R\.H|H\.M\.'
    r'|H\.H\.|H\.E\.|Rev\.|Sir|General (?!Assembly)|H\.S\.H|Mr\.|Mrs\.'
    r'|Prof\.|Dr\.|Professor|Ms\.|Amb\.?|Mayor|Messrs\.|Senator'
    r'|(The )?R(igh)?t\.? Hon(ou?rable)?\.?|Hon\.|U\.S\. House'
    r'|U\.S\. Senator|US Congressman|Judge|Archbishop|The Honorable|Rabbi'
    r'|Lt\.|Major General'
)

# Roles appearing anywhere in the text, also reported as "Meeting ...".
_PRETTIFY_ROLES = (
    r'Secretary-General of the League|Senior Adviser|Special Adviser'
    r'|Permanent Representative|Special Representative|Minister of'
    r'|Secretary of State for|Administrator|CEO|National Adviser'
    r'|Ambassador|students'
)


def prettify(s):
    """Rephrase a schedule entry as a short first-person activity.

    Entries already phrased as an activity are returned unchanged. Known
    "Secretary-General to ..." phrasings become gerunds ("Addressing ...").
    What remains is prefixed with "Meeting" for people and groups, or
    "Attending the" for everything else.

    s -- the event text.
    Returns the rewritten text.
    """
    if re.match(r'Addressing|Meeting (with|on)|Visiting|Visit to|Press Conference', s):
        return s

    for finder, trigger, target, replacement in _PRETTIFY_REWRITES:
        if finder(trigger, s):
            return re.sub(target, replacement, s)

    # Inline (?i) flags lead the pattern; a trailing (?i) is rejected by
    # newer versions of the re module.
    if (re.search(r'(?i)Presentation of credential', s)
            or re.match(r'Remarks at', s)
            or re.match(r'Election of', s)
            or re.match(r'Swearing in Ceremony', s)):
        pass
    elif (re.search(_PRETTIFY_GROUPS, s)
            and not re.search(r'(?i)(concert|luncheon|breakfast)', s)):
        s = 'Meeting the %s' % s
    elif re.match(_PRETTIFY_TITLES, s) and not re.search(r'(?i)luncheon', s):
        s = re.sub(r'Amb\.', 'Ambassador', s)
        s = re.sub(r'^Amb ', 'Ambassador ', s)
        s = 'Meeting %s' % s
    elif (re.search(_PRETTIFY_ROLES, s)
            and not re.search(r'(?i)(concert|luncheon|breakfast|hosted by)', s)):
        s = 'Meeting %s' % s
    elif re.match(r'The ', s):
        s = re.sub(r'^The ', 'Attending the ', s)
    else:
        s = 'Attending the %s' % s
    return s
def parsecell(s, d=False):
    """Return the text of a table cell with its markup removed.

    s -- object whose renderContents() gives the cell's inner HTML
    d -- when true, keep some structure: '<br />' becomes ', ' and '</p>'
         becomes a space before the remaining tags are dropped

    Whitespace is collapsed, surrounding spaces and semicolons are trimmed,
    and HTML entities are decoded with unescape().
    """
    # '\xc2\xa0' is a UTF-8 encoded non-breaking space.
    text = re.sub('\xc2\xa0', ' ', s.renderContents())

    substitutions = []
    if d:
        substitutions.append(("<br />", ", "))
        substitutions.append(("</p>", " "))
    substitutions.append(("<[^>]*>", ""))
    substitutions.append(("&nbsp;", ""))
    substitutions.append((r"\s+", " "))

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return unescape(text.strip(" ;"))
def get_contents(s):
    """Return the full contents of a URL or a local file.

    s -- an 'http://' or 'https://' URL, fetched with urllib.urlopen;
         anything else is opened as a local path.

    Errors from opening or reading are left to propagate to the caller.
    """
    if 'http://' in s or 'https://' in s:
        f = urllib.urlopen(s)
    else:
        f = open(s)
    # try/finally rather than `with`: the handle must be closed even if the
    # read fails, and both kinds of handle offer close().
    try:
        return f.read()
    finally:
        f.close()
def unescape(text):
    """Replace HTML character references and named entities in text.

    Handles decimal ('&#8211;') and hexadecimal ('&#x2013;') references and
    names known to htmlentitydefs ('&amp;'). Anything that cannot be decoded
    is left exactly as written.
    """
    def fixup(m):
        token = m.group(0)
        if token[:2] != "&#":
            # named entity
            try:
                return unichr(htmlentitydefs.name2codepoint[token[1:-1]])
            except KeyError:
                return token  # leave as is
        # character reference
        try:
            if token[:3] == "&#x":
                return unichr(int(token[3:-1], 16))
            return unichr(int(token[2:-1]))
        except ValueError:
            return token  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
# Script entry point. There is no __name__ guard, so this also runs whenever
# the module is imported.
main()

Navigation