'''
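Scrape Equestria Daily posts through the Google/Blogger API.  This retrieves
posts labelled 'Music' (metadata and content), starting from the most recent
post, and stores them in json files named `posts-NNN.json`.

The Google API key is read from `api-key.txt` in the current directory.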

For normal operation, just run this script with no arguments.

To resume a previous session, look for output like this from the previous run:

```
  writing 100 posts to posts-123.json
reading posts at 'xxxxxxxxxxxxxxxxxxxxxxxxxxxx'
```

And run the script as `python3 scrape_blog.py 124 xxxxxxxxxxxxxxxxxxxxxxxxxxxx`
to resume from that point.
'''

import json
from pprint import pprint
import sys
import time
from urllib.request import urlopen
from urllib.parse import urlencode

# Read the Google API key from `api-key.txt` (resolved relative to the current
# working directory); surrounding whitespace such as a trailing newline is
# stripped.  This runs at import time, so the file must exist.
with open('api-key.txt') as f:
    API_KEY = f.read().strip()

# The internal ID of the Equestria Daily blog.
BLOG_ID = 1865775941337421610

def post_list_url(fields, labels, max_results=20, page_token=None):
    '''
    Build the Blogger API URL that lists posts of the blog `BLOG_ID`.

    `fields` names the per-post fields to request and `labels` the labels to
    filter on; both are iterables of strings and are sent comma-separated.
    `max_results` is sent as `maxResults`.  `page_token`, when it is not None,
    is sent as `pageToken`.  The API key `API_KEY` is always included.

    Returns the complete URL, query string included, as a str.
    '''
    query = dict(
        key=API_KEY,
        maxResults=max_results,
        labels=','.join(labels),
        # Always ask for `nextPageToken` alongside the selected post fields.
        fields='nextPageToken,items({})'.format(','.join(fields)),
    )
    if page_token is not None:
        query['pageToken'] = page_token

    endpoint = 'https://www.googleapis.com/blogger/v3/blogs/%s/posts' % (BLOG_ID,)
    return endpoint + '?' + urlencode(query)

def post_list(fields, labels, max_results=20, page_token=None):
    '''
    Fetch one page of posts from the Blogger API.

    All arguments are passed unchanged to `post_list_url`, which builds the
    request URL.

    Returns a pair `(next_page_token, posts)`.  `next_page_token` is the
    value of `nextPageToken` in the response, or None when the response has
    none.  `posts` is the list under `items`, or an empty list when the
    response has no `items` key.

    Errors raised by `urlopen` (e.g. `urllib.error.HTTPError`) and by JSON
    decoding are not caught here.
    '''
    url = post_list_url(fields, labels, max_results, page_token)
    # Use the response as a context manager so the connection is closed
    # promptly instead of whenever the object is garbage collected.
    with urlopen(url) as response:
        data = response.read().decode('utf-8')
    j = json.loads(data)
    # A page without posts may lack the `items` key entirely; report that as
    # an empty list so callers can test for emptiness instead of crashing.
    return (j.get('nextPageToken'), j.get('items', []))


def main():
    '''
    Download pages of 'Music'-labelled posts and write each page to a file.

    With no command-line arguments, start at file index 0 with the first
    page.  With two arguments, `START_INDEX PAGE_TOKEN`, resume from that
    file index and page token.  A single argument is rejected with a usage
    message, because a page token is needed to resume.

    Each non-empty page is written to `posts-NNN.json` in the current
    directory.  The loop ends on an empty page or when the API returns no
    further page token.
    '''
    args = sys.argv[1:]
    if not args:
        i = 0
        next_page = None
    elif len(args) >= 2:
        i = int(args[0])
        next_page = args[1]
    else:
        # A file index without a page token cannot be resumed.
        sys.exit('usage: %s [START_INDEX PAGE_TOKEN]' % sys.argv[0])

    while True:
        # This line and the "writing" line below are what the module
        # docstring tells users to look for when resuming a session.
        print('reading posts at %r' % next_page)
        next_page, posts = post_list(
                fields=('id', 'url', 'published', 'labels', 'title', 'content'),
                labels=('Music',),
                max_results=100,
                page_token=next_page)
        if not posts:
            break

        filename = 'posts-%03d.json' % i
        print('  oldest: %s' % posts[-1]['published'])
        print('  writing %d posts to %s' % (len(posts), filename))
        with open(filename, 'w') as f:
            json.dump(posts, f)
        i += 1

        if next_page is None:
            break
        # Pause between requests; skipped after the last page so the script
        # does not wait before exiting.
        time.sleep(1.5)


# Only start scraping when executed as a script, not when imported.
if __name__ == '__main__':
    main()


