# blob: 4216fe829c04a08e134568a3fc0bc3c6edfedd4d [file] [log] [blame]
# Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
# for details. All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.
#!/usr/bin/env python3
#
import re, base64, logging, pickle, httplib2, time, urlparse, urllib2, urllib, StringIO, gzip, zipfile
from google.appengine.ext import webapp, db
from google.appengine.api import taskqueue, urlfetch, memcache, images, users
from google.appengine.ext.webapp.util import login_required
from google.appengine.ext.webapp import template
from django.utils import simplejson as json
from django.utils.html import strip_tags
from oauth2client.appengine import CredentialsProperty
from oauth2client.client import OAuth2WebServerFlow
import encoder
# TODO(jimhug): Allow client to request desired thumb size.
# (width, height) that generateThumbnail() resizes article thumbnails to.
THUMB_SIZE = (57, 57)
# Base URL for Google Reader API requests; also passed as the OAuth scope.
READER_API = 'http://www.google.com/reader/api/0'
# Maximum number of sections written by UserData.getEncodedData().
MAX_SECTIONS = 5
# Maximum number of articles written per feed by Feed.ensureEncodedFeed().
MAX_ARTICLES = 20
class UserData(db.Model):
    """Per-user state: OAuth credentials and the ordered list of sections."""
    # Credentials obtained from the OAuth2 flow (set by OAuthHandler).
    credentials = CredentialsProperty()
    # Keys of the Section entities shown to this user, in display order.
    sections = db.ListProperty(db.Key)

    def getEncodedData(self, articleKeys=None):
        """Encode up to MAX_SECTIONS of the user's sections.

        Args:
            articleKeys: optional list; when given, each section appends the
                keys of the articles it encoded.

        Returns:
            The raw output of the encoder: a section count followed by each
            section's encoding.
        """
        enc = encoder.Encoder()
        # TODO(jimhug): Only return initially visible section in first reply.
        # db.get() yields None for any key without a stored entity (e.g. the
        # hard-coded keys installed by SetDefaultFeeds), so drop those before
        # writing the count to keep the count and the payload in agreement.
        found = [
            section for section in db.get(self.sections[:MAX_SECTIONS])
            if section is not None
        ]
        enc.writeInt(len(found))
        for section in found:
            section.encode(enc, articleKeys)
        return enc.getRaw()
class Section(db.Model):
    """A titled, ordered group of feeds."""
    title = db.TextProperty()
    # Keys of the Feed entities in this section, in display order.
    feeds = db.ListProperty(db.Key)

    def fixedTitle(self):
        """Return the part of the title that precedes the first underscore."""
        head, _, _ = self.title.partition('_')
        return head

    def encode(self, enc, articleKeys=None):
        """Write this section and all of its feeds to enc.

        When articleKeys is a list, the top-article keys of every feed are
        appended to it.
        """
        # TODO(jimhug): Need to optimize format and support incremental updates.
        enc.writeString(self.key().name())
        enc.writeString(self.fixedTitle())
        enc.writeInt(len(self.feeds))
        collecting = articleKeys is not None
        for feed in db.get(self.feeds):
            feed.ensureEncodedFeed()
            enc.writeRaw(feed.encodedFeed3)
            if collecting:
                articleKeys.extend(feed.topArticles)
class Feed(db.Model):
    """A single feed together with a cached encoding of its newest articles."""
    title = db.TextProperty()
    iconUrl = db.TextProperty()
    # Value of the 'updated' field from the last crawl; None until crawled.
    lastUpdated = db.IntegerProperty()
    # Cached encoder output for this feed (see ensureEncodedFeed).
    encodedFeed3 = db.TextProperty()
    # Keys of the articles contained in encodedFeed3.
    topArticles = db.ListProperty(db.Key)

    def ensureEncodedFeed(self, force=False):
        """Build and store the cached encoding if it is missing or forced."""
        if not force and self.encodedFeed3 is not None:
            return
        enc = encoder.Encoder()
        keys = []
        self.encode(enc, MAX_ARTICLES, keys)
        logging.info('articleSet length is %s' % len(keys))
        self.topArticles = keys
        self.encodedFeed3 = enc.getRaw()
        self.put()

    def encode(self, enc, maxArticles, articleSet):
        """Write the feed header and its newest articles to enc.

        The key of every encoded article is appended to articleSet.
        """
        enc.writeString(self.key().name())
        enc.writeString(self.title)
        enc.writeString(self.iconUrl)
        logging.info('encoding feed: %s' % self.title)
        newest = self.article_set.order('-date').fetch(limit=maxArticles)
        headers = []
        for article in newest:
            headers.append(article.encodeHeader())
            articleSet.append(article.key())
        enc.writeInt(len(headers))
        enc.writeRaw(''.join(headers))
class Article(db.Model):
    """One article of a feed, with its content, snippet and thumbnail."""
    feed = db.ReferenceProperty(Feed)
    title = db.TextProperty()
    author = db.TextProperty()
    content = db.TextProperty()
    snippet = db.TextProperty()
    thumbnail = db.BlobProperty()
    # str(THUMB_SIZE) at the time the thumbnail was last generated.
    thumbnailSize = db.TextProperty()
    srcurl = db.TextProperty()
    date = db.IntegerProperty()

    def ensureThumbnail(self):
        """Regenerate and save the thumbnail when the target size changed."""
        wanted = str(THUMB_SIZE)
        if self.thumbnailSize == wanted:
            return
        self.thumbnail = makeThumbnail(self.content)
        self.thumbnailSize = wanted
        self.put()

    def encodeHeader(self):
        """Return the encoded header fields of this article (no content)."""
        # TODO(jmesserly): for now always unescape until the crawler catches up
        header = encoder.Encoder()
        header.writeString(self.key().name())
        header.writeString(unescape(self.title))
        header.writeString(self.srcurl)
        hasThumbnail = self.thumbnail is not None
        header.writeBool(hasThumbnail)
        header.writeString(self.author)
        header.writeInt(self.date)
        header.writeString(unescape(self.snippet))
        return header.getRaw()
class HtmlFile(db.Model):
    """An uploaded HTML file served by MainHandler."""
    # File bytes; gzip-compressed when `compressed` is True.
    content = db.BlobProperty()
    compressed = db.BooleanProperty()
    # Key name under which the file was stored (see UpdateHtml).
    filename = db.StringProperty()
    author = db.UserProperty(auto_current_user=True)
    date = db.DateTimeProperty(auto_now_add=True)
class UpdateHtml(webapp.RequestHandler):
    """Stores uploaded HTML files as HtmlFile entities."""

    def post(self):
        """Save every posted 'files' field, then redirect to '/'."""
        upload_files = self.request.POST.multi.__dict__['_items']
        version = self.request.get('version')
        logging.info('files: %r' % upload_files)
        for entry in upload_files:
            if entry[0] != 'files':
                continue
            upload = entry[1]
            filename = upload.filename
            if version:
                filename = '%s-%s' % (version, filename)
            logging.info('upload: %r' % filename)

            htmlFile = HtmlFile.get_or_insert(filename)
            htmlFile.filename = filename

            # If text > (1MB - 1KB) then gzip text to fit in 1MB space
            text = upload.value
            oversized = len(text) > 1024 * 1023
            if oversized:
                buf = StringIO.StringIO()
                gz = gzip.GzipFile(str(filename), 'wb', fileobj=buf)
                gz.write(text)
                gz.close()
                htmlFile.content = buf.getvalue()
            else:
                htmlFile.content = text
            htmlFile.compressed = oversized
            htmlFile.put()

        self.redirect('/')
class TopHandler(webapp.RequestHandler):
    """Serves the landing page listing the most recent uploaded files."""

    @login_required
    def get(self):
        """Render top.html, or redirect users without UserData to set-up."""
        user = users.get_current_user()
        if UserData.get_by_key_name(user.user_id()) is None:
            self.redirect('/update/user')
            return
        recent = HtmlFile.all().order('-date').fetch(limit=30)
        page = template.render('top.html', {'files': recent})
        self.response.out.write(page)
class MainHandler(webapp.RequestHandler):
    """Serves the built-in pages and stored HtmlFile entities by name."""

    @login_required
    def get(self, name):
        """Dispatch the built-in pages, otherwise serve the HtmlFile `name`."""
        builtin = {
            'dev': self.handleDev,
            'login': self.handleLogin,
            'upload': self.handleUpload,
        }
        if name in builtin:
            return builtin[name]()

        user = users.get_current_user()
        if UserData.get_by_key_name(user.user_id()) is None:
            return self.handleLogin()

        html = HtmlFile.get_by_key_name(name)
        if html is None:
            self.error(404)
            return

        self.response.headers['Content-Type'] = 'text/html'
        if not html.compressed:
            self.response.out.write(html.content)
            return
        # TODO(jimhug): This slightly sucks ;-)
        # Can we write directly to the response.out?
        gz = gzip.GzipFile(
            name, 'rb', fileobj=StringIO.StringIO(html.content))
        self.response.out.write(gz.read())
        gz.close()
        # TODO(jimhug): Include first data packet with html.

    def handleLogin(self):
        """Start the OAuth flow and render the login page.

        The pickled flow is kept in memcache under the user's id so that
        OAuthHandler can finish the exchange.
        """
        user = users.get_current_user()
        # TODO(jimhug): Manage secrets for dart.googleplex.com better.
        # TODO(jimhug): Confirm that we need client_secret.
        flow = OAuth2WebServerFlow(
            client_id='267793340506.apps.googleusercontent.com',
            client_secret='5m8H-zyamfTYg5vnpYu1uGMU',
            scope=READER_API,
            user_agent='swarm')
        callback = self.request.relative_url('/oauth2callback')
        authorize_url = flow.step1_get_authorize_url(callback)
        memcache.set(user.user_id(), pickle.dumps(flow))
        page = template.render('login.html', {'authorize': authorize_url})
        self.response.out.write(page)

    def handleDev(self):
        """Render dev.html for the current user."""
        params = {'user': users.get_current_user()}
        self.response.out.write(template.render('dev.html', params))

    def handleUpload(self):
        """Render upload.html for the current user."""
        params = {'user': users.get_current_user()}
        self.response.out.write(template.render('upload.html', params))
class UploadFeed(webapp.RequestHandler):
    """Imports uploaded feed JSON files into existing sections."""

    def post(self):
        """Add every posted 'files' field as a feed, then redirect to '/'.

        The file name is used as the feed id and the JSON's 'section' entry
        names the section (matched via findSectionByTitle) that receives the
        feed. Files naming an unknown section are logged and skipped.
        """
        upload_files = self.request.POST.multi.__dict__['_items']
        logging.info('files: %r' % upload_files)
        for entry in upload_files:
            if entry[0] != 'files':
                continue
            upload = entry[1]
            logging.info('upload feed: %r' % upload.filename)

            feedData = json.loads(upload.value)
            feedId = upload.filename
            feed = Feed.get_or_insert(feedId)

            # Find the section to add it to.
            sectionTitle = feedData['section']
            section = findSectionByTitle(sectionTitle)
            if section is None:
                logging.error('Could not find section %s to add the feed to' %
                              sectionTitle)
                continue

            if feed.key() in section.feeds:
                logging.warn('Already contains feed %s, replacing' % feedId)
                section.feeds.remove(feed.key())

            # Add the feed to the front of the section.
            section.feeds.insert(0, feed.key())
            section.put()

            # Add the articles.
            collectFeed(feed, feedData)

        self.redirect('/')
# TODO(jimhug): Batch these up and request them more aggressively.
class DataHandler(webapp.RequestHandler):
    """Serves article thumbnails, article content and encoded user data."""

    def get(self, name):
        """Serve the resource `name` requested under /data/.

        Handles '<article key>.jpg' (thumbnail), '<article key>.html'
        (content), 'user.data', 'CannedData.dart' and 'CannedData.zip'.
        Anything else, and any thumbnail that does not exist, yields a 404.
        """
        if name.endswith('.jpg'):
            # Must be a thumbnail
            key = urllib2.unquote(name[:-len('.jpg')])
            article = Article.get_by_key_name(key)
            if article is None:
                self.error(404)
                return
            article.ensureThumbnail()
            if not article.thumbnail:
                # No usable image was found for this article.
                self.error(404)
                return
            self.response.headers['Content-Type'] = 'image/jpeg'
            # cache images for 10 hours
            self.response.headers['Cache-Control'] = 'public,max-age=36000'
            self.response.out.write(article.thumbnail)
        elif name.endswith('.html'):
            # Must be article content
            key = urllib2.unquote(name[:-len('.html')])
            article = Article.get_by_key_name(key)
            self.response.headers['Content-Type'] = 'text/html'
            if article is None:
                content = '<h2>Missing article</h2>'
            else:
                content = article.content
                # cache article content for 10 hours
                self.response.headers['Cache-Control'] = 'public,max-age=36000'
            self.response.out.write(content)
        elif name == 'user.data':
            self.response.out.write(self.getUserData())
        elif name == 'CannedData.dart':
            self.canData()
        elif name == 'CannedData.zip':
            self.canDataZip()
        else:
            self.error(404)

    def getUserData(self, articleKeys=None):
        """Return the encoded data for the current user.

        Args:
            articleKeys: optional list that receives the keys of all encoded
                articles. It is only filled when the data is built here, not
                when it comes from memcache.
        """
        user = users.get_current_user()
        user_id = user.user_id()

        key = 'data_' + user_id
        # need to flush memcache fairly frequently...
        data = memcache.get(key)
        if data is None:
            # TODO(jimhug): Graceful failure for unknown users.
            prefs = UserData.get_or_insert(user_id)
            data = prefs.getEncodedData(articleKeys)
            # TODO(jimhug): memcache.set(key, data)

        return data

    def canData(self):
        """Write a Dart source file embedding the user's data and articles."""

        def makeDartSafe(data):
            return repr(unicode(data))[1:].replace('$', '\\$')

        user = users.get_current_user()
        prefs = UserData.get_by_key_name(user.user_id())
        if prefs is None:
            # Nothing to export for a user that has never been set up.
            self.error(404)
            return

        lines = [
            '// TODO(jimhug): Work out correct copyright for this file.',
            'class CannedData {'
        ]

        articleKeys = []
        data = prefs.getEncodedData(articleKeys)
        lines.append(' static const Map<String,String> data = const {')
        for article in db.get(articleKeys):
            key = makeDartSafe(urllib.quote(article.key().name()) + '.html')
            lines.append(' %s:%s, ' % (key, makeDartSafe(article.content)))

        lines.append(' "user.data":%s' % makeDartSafe(data))

        lines.append(' };')

        lines.append('}')
        self.response.headers['Content-Type'] = 'application/dart'
        self.response.out.write('\n'.join(lines))

    # Get canned static data
    def canDataZip(self):
        """Write a zip of the user's data, article content and thumbnails."""
        # We need to zip into an in-memory buffer to get the right string encoding
        # behavior.
        data = StringIO.StringIO()
        result = zipfile.ZipFile(data, 'w')

        articleKeys = []
        result.writestr('data/user.data',
                        self.getUserData(articleKeys).encode('utf-8'))
        logging.info(' adding articles %s' % len(articleKeys))
        for article in db.get(articleKeys):
            article.ensureThumbnail()
            path = 'data/' + article.key().name() + '.html'
            result.writestr(
                path.encode('utf-8'), article.content.encode('utf-8'))
            if article.thumbnail:
                path = 'data/' + article.key().name() + '.jpg'
                result.writestr(path.encode('utf-8'), article.thumbnail)

        result.close()
        logging.info('writing CannedData.zip')
        self.response.headers['Content-Type'] = 'multipart/x-zip'
        disposition = 'attachment; filename=CannedData.zip'
        self.response.headers['Content-Disposition'] = disposition
        self.response.out.write(data.getvalue())
        data.close()
class SetDefaultFeeds(webapp.RequestHandler):
    """Points the current user at a fixed set of pre-crawled sections."""

    @login_required
    def get(self):
        """Replace the user's sections with the defaults; redirect to '/'."""
        user = users.get_current_user()
        prefs = UserData.get_or_insert(user.user_id())

        labels = ('Top', 'Design', 'Eco', 'Geek', 'Google', 'Seattle', 'Tech',
                  'Web')
        prefs.sections = [
            db.Key.from_path('Section',
                             'user/17857667084667353155/label/' + label)
            for label in labels
        ]
        prefs.put()

        self.redirect('/')
class SetTestFeeds(webapp.RequestHandler):
    """Creates synthetic sections, feeds and articles for testing."""

    @login_required
    def get(self):
        """Create 3 sections x 4 feeds x 8 articles and assign them to the user.

        Existing test articles (those that already have a title) are left
        untouched; new ones are filled with placeholder text and saved.
        """
        user = users.get_current_user()
        prefs = UserData.get_or_insert(user.user_id())

        # The icon is the same for every test feed, so compute it once.
        iconUrl = getFeedIcon('http://google.com')

        sections = []
        for i in range(3):
            s1 = Section.get_or_insert('Test%d' % i)
            s1.title = 'Section %d' % (i + 1)

            feeds = []
            for j in range(4):
                label = '%d_%d' % (i, j)
                f1 = Feed.get_or_insert('Test%s' % label)
                f1.title = 'Feed %s' % label
                f1.iconUrl = iconUrl
                f1.lastUpdated = 0
                f1.put()
                feeds.append(f1.key())

                for k in range(8):
                    label = '%d_%d_%d' % (i, j, k)
                    a1 = Article.get_or_insert('Test%s' % label)
                    if a1.title is None:
                        a1.feed = f1
                        a1.title = 'Article %s' % label
                        a1.author = 'anon'
                        a1.content = 'Lorem ipsum something or other...'
                        a1.snippet = 'Lorem ipsum something or other...'
                        a1.thumbnail = None
                        a1.srcurl = ''
                        a1.date = 0
                        # Without this the populated fields (including the
                        # feed reference) are never written to the datastore.
                        a1.put()

            s1.feeds = feeds
            s1.put()
            sections.append(s1.key())

        prefs.sections = sections
        prefs.put()

        self.redirect('/')
class UserLoginHandler(webapp.RequestHandler):
    """Imports the user's Reader subscriptions as sections and feeds."""

    @login_required
    def get(self):
        """Fetch the subscription list, or send the user to '/login'."""
        user = users.get_current_user()
        prefs = UserData.get_or_insert(user.user_id())
        if prefs.credentials:
            http = prefs.credentials.authorize(httplib2.Http())

            response, content = http.request(
                '%s/subscription/list?output=json' % READER_API)
            self.collectFeeds(prefs, content)
            self.redirect('/')
        else:
            self.redirect('/login')

    def collectFeeds(self, prefs, content):
        """Create sections and feeds from a subscription-list JSON document.

        Each subscription is filed under its first category, and an update
        task is queued for it. Subscriptions without any category are logged
        and skipped. The resulting sections are stored on prefs sorted by
        title, with 'Top' forced first.
        """
        data = json.loads(content)

        queue_name = self.request.get('queue_name', 'priority-queue')
        sections = {}
        for feedData in data['subscriptions']:
            categories = feedData.get('categories')
            if not categories:
                # There is no section to file this feed under.
                logging.warn('Skipping feed without categories: %s' %
                             feedData['id'])
                continue

            # get_or_insert already stores a newly created entity, so no
            # extra put() is needed here.
            feed = Feed.get_or_insert(feedData['id'])
            category = categories[0]
            categoryId = category['id']
            if categoryId not in sections:
                sections[categoryId] = (category['label'], [])

            # TODO(jimhug): Use Reader preferences to sort feeds in a section.
            sections[categoryId][1].append(feed.key())

            # Kick off a high priority feed update
            taskqueue.add(
                url='/update/feed',
                queue_name=queue_name,
                params={'id': feed.key().name()})

        sectionKeys = []
        for name, (title, feeds) in sections.items():
            section = Section.get_or_insert(name)
            section.feeds = feeds
            section.title = title
            section.put()
            # Forces Top to be the first section
            if title == 'Top':
                title = '0Top'
            sectionKeys.append((title, section.key()))

        # TODO(jimhug): Use Reader preferences API to get users true sort order.
        prefs.sections = [key for t, key in sorted(sectionKeys)]
        prefs.put()
class AllFeedsCollector(webapp.RequestHandler):
    '''Queues an update task for every known feed.'''

    def post(self):
        return self.get()

    def get(self):
        '''Add one '/update/feed' task per Feed to the requested queue.'''
        target_queue = self.request.get('queue_name', 'background')
        for feed in Feed.all():
            task_params = {'id': feed.key().name()}
            taskqueue.add(
                url='/update/feed',
                queue_name=target_queue,
                params=task_params)
# Crawl tuning used by FeedCollector and collectArticle.
UPDATE_COUNT = 4 # Articles requested when the feed has been updated before.
INITIAL_COUNT = 40 # Articles requested when feed.lastUpdated is still None.
SNIPPET_SIZE = 180 # Characters of tag-stripped content kept as the snippet.
class FeedCollector(webapp.RequestHandler):
    """Fetches the newest articles of one feed from the Reader API."""

    def post(self):
        return self.get()

    def get(self):
        """Update the feed named by the 'id' request parameter."""
        feedId = self.request.get('id')
        feed = Feed.get_or_insert(feedId)

        # A feed that was never updated gets a larger initial batch.
        count = INITIAL_COUNT if feed.lastUpdated is None else UPDATE_COUNT
        self.fetchn(feed, feedId, count)

        self.response.headers['Content-Type'] = "text/plain"

    def fetchn(self, feed, feedId, n, continuation=None):
        """Fetch n articles of the feed and store them via collectFeed.

        Non-200 responses are reported in the response body instead.
        """
        # basic pattern is to read n articles at a time until we hit existing.
        if continuation is not None:
            apiUrl = '%s/stream/contents/%s?n=%d&c=%s' % (READER_API, feedId, n,
                                                          continuation)
        else:
            apiUrl = '%s/stream/contents/%s?n=%d' % (READER_API, feedId, n)

        logging.info('fetching: %s' % apiUrl)
        result = urlfetch.fetch(apiUrl)

        status = result.status_code
        if status == 200:
            collectFeed(feed, json.loads(result.content), continuation)
        elif status == 401:
            self.response.out.write('<pre>%s</pre>' % result.content)
        else:
            self.response.out.write(status)
def findSectionByTitle(title):
    """Return the first Section whose fixedTitle() equals title, else None."""
    matching = (
        section for section in Section.all()
        if section.fixedTitle() == title)
    return next(matching, None)
def collectFeed(feed, data, continuation=None):
    '''
    Reads a feed from the given JSON object and populates the given feed object
    in the datastore with its data.

    Feed-level fields (icon, title, last update) are only refreshed on the
    first page, i.e. when continuation is None.

    Returns False as soon as collectArticle reports an already known article,
    True once every article in data['items'] has been processed.
    '''
    if continuation is None:
        if 'alternate' in data:
            feed.iconUrl = getFeedIcon(data['alternate'][0]['href'])
        feed.title = data['title']
        feed.lastUpdated = data['updated']

    articles = data['items']
    logging.info('%d new articles for %s' % (len(articles), feed.title))

    for articleData in articles:
        if not collectArticle(feed, articleData):
            feed.put()
            return False

    if articles and 'continuation' in data:
        # TODO(jimhug): Follow data['continuation'] to fetch older articles
        # once that path is more robust.
        logging.info('would have looked for more articles')

    feed.ensureEncodedFeed(force=True)
    feed.put()
    return True
def collectArticle(feed, data):
    '''
    Reads an article from the given JSON object and populates the datastore with
    it.

    Returns False when the stored article already has the same 'published'
    value (the caller stops crawling), True otherwise, including for items
    without a title, which are skipped.
    '''
    if 'title' not in data:
        # Skip articles without titles
        return True

    articleId = data['id']
    article = Article.get_or_insert(articleId)
    # TODO(jimhug): This aborts too early - at least for one adafruit case.
    if article.date == data['published']:
        logging.info(
            'found existing, aborting: %r, %r' % (articleId, article.date))
        return False

    if 'content' in data:
        content = data['content']['content']
    elif 'summary' in data:
        content = data['summary']['content']
    else:
        content = ''
    #TODO(jimhug): better summary?
    article.content = content
    article.date = data['published']
    article.title = unescape(data['title'])
    article.snippet = unescape(strip_tags(content)[:SNIPPET_SIZE])

    article.feed = feed

    # TODO(jimhug): make this canonical so UX can change for this state
    article.author = data.get('author', 'anonymous')

    # The last alternate link that carries an href wins.
    article.srcurl = ''
    for alt in data.get('alternate', ()):
        if 'href' in alt:
            article.srcurl = alt['href']

    # ensureThumbnail() only saves the entity when it regenerates the
    # thumbnail, so save explicitly to make sure every field assigned above
    # (including srcurl and updates to existing articles) is persisted.
    article.ensureThumbnail()
    article.put()
    return True
def unescape(html):
    """Inverse of Django's utils.html.escape function.

    Non-string input is converted with str() first. '&amp;' is replaced last
    so that text such as '&amp;lt;' becomes '&lt;' rather than '<'.
    """
    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        text_types = basestring
    except NameError:
        text_types = str
    if not isinstance(html, text_types):
        html = str(html)
    html = html.replace('&#39;', "'").replace('&quot;', '"')
    return html.replace('&gt;', '>').replace('&lt;', '<').replace('&amp;', '&')
def getFeedIcon(url):
    """Return the favicon-service URL for the host part of url."""
    parts = urlparse.urlparse(url)
    host = parts.netloc
    return 'http://s2.googleusercontent.com/s2/favicons?domain=%s&alt=feed' % host
def findImage(text):
    """Return the URL of the best thumbnail source found in text, or None.

    Preference order: jpg/jpeg/png image, then video thumbnail, then gif.
    """
    candidate = findImgTag(text, 'jpg|jpeg|png')
    if candidate is None:
        candidate = findVideoTag(text)
    if candidate is None:
        candidate = findImgTag(text, 'gif')
    return candidate
def findImgTag(text, extensions):
    """Return the first src="..." URL in text ending in one of extensions.

    Args:
        text: HTML to search.
        extensions: '|'-separated file extensions, e.g. 'jpg|jpeg|png'.

    Returns:
        The http or https URL without any query string, or None when nothing
        matches.
    """
    # Accept https as well as http; many feeds serve images over https only.
    m = re.search(r'src="(https?://\S+\.(%s))(\?.*)?"' % extensions, text)
    if m is None:
        return None
    return m.group(1)
def findVideoTag(text):
    """Return the thumbnail URL for the first embedded YouTube video, or None.

    Matches src="http(s)://www.youtube.com/<segment>/<video id>" and maps the
    video id to its img.youtube.com thumbnail.
    """
    # TODO(jimhug): Add other videos beyond youtube.
    # The id is limited to word characters and '-' so that a query string or
    # markup following the URL (e.g. '"></iframe>') is never captured.
    m = re.search(r'src="https?://www\.youtube\.com/[^/\s"]+/([\w-]+)', text)
    if m is None:
        return None
    return 'http://img.youtube.com/vi/%s/0.jpg' % m.group(1)
def makeThumbnail(text):
    """Return thumbnail image data for the first image found in text.

    This is best effort: None is returned when no image is found or when
    fetching or decoding it fails.
    """
    url = None
    try:
        url = findImage(text)
        if url is None:
            return None
        return generateThumbnail(url)
    except Exception:
        # Catch Exception rather than everything so that exceptions deriving
        # directly from BaseException (e.g. KeyboardInterrupt, SystemExit)
        # still propagate.
        logging.info('error decoding: %s' % (url or text), exc_info=True)
        return None
def generateThumbnail(url):
    """Fetch the image at url and return it cropped and resized to THUMB_SIZE.

    The image is cropped to the thumbnail's aspect ratio first: wide images
    lose equal amounts on both sides, tall images lose their bottom part.
    The result is JPEG encoded.
    """
    logging.info('generating thumbnail: %s' % url)
    thumbWidth, thumbHeight = THUMB_SIZE

    fetched = urlfetch.fetch(url)
    img = images.Image(fetched.content)

    w, h = img.width, img.height

    aspect = float(w) / h
    thumbAspect = float(thumbWidth) / thumbHeight

    if aspect > thumbAspect:
        # Too wide, so crop on the sides.
        side = (w - h * thumbAspect) / (2.0 * w)
        img.crop(side, 0., 1. - side, 1.)
    elif aspect < thumbAspect:
        # Too tall, so crop out the bottom.
        bottom = (h - w / thumbAspect) / h
        img.crop(0., 0., 1., 1. - bottom)

    img.resize(thumbWidth, thumbHeight)

    # Chose JPEG encoding because informal experiments showed it generated
    # the best size to quality ratio for thumbnail images.
    encoded = img.execute_transforms(output_encoding=images.JPEG)
    logging.info(' finished thumbnail: %s' % url)

    return encoded
class OAuthHandler(webapp.RequestHandler):
    """Completes the OAuth flow started by MainHandler.handleLogin."""

    @login_required
    def get(self):
        """Exchange the callback parameters for credentials and store them.

        The pickled flow is read from memcache under the user's id. If it is
        missing, the user is sent back to '/login' to start a new flow;
        otherwise the credentials are saved and the user is redirected to
        '/update/user'.
        """
        user = users.get_current_user()
        pickled = memcache.get(user.user_id())
        if not pickled:
            # memcache.get() returns None for a missing entry, and passing
            # that to pickle.loads() would raise.
            self.redirect('/login')
            return

        # NOTE(review): this unpickles whatever is stored under the user's id
        # in memcache; it must only ever contain data written by handleLogin.
        flow = pickle.loads(pickled)
        prefs = UserData.get_or_insert(user.user_id())
        prefs.credentials = flow.step2_exchange(self.request.params)
        prefs.put()
        self.redirect('/update/user')
def main():
    """Build the WSGI application from the route table and run it."""
    routes = [
        ('/data/(.*)', DataHandler),

        # This is called periodically from cron.yaml.
        ('/update/allFeeds', AllFeedsCollector),
        ('/update/feed', FeedCollector),
        ('/update/user', UserLoginHandler),
        ('/update/defaultFeeds', SetDefaultFeeds),
        ('/update/testFeeds', SetTestFeeds),
        ('/update/html', UpdateHtml),
        ('/update/upload', UploadFeed),
        ('/oauth2callback', OAuthHandler),
        ('/', TopHandler),
        # Catch-all; must stay last so the routes above take precedence.
        ('/(.*)', MainHandler),
    ]
    application = webapp.WSGIApplication(routes, debug=True)
    webapp.util.run_wsgi_app(application)


if __name__ == '__main__':
    main()