'''
Extract iframe and link URLs from blog posts.  This expects to see a bunch of
files named `posts-*.json` in the current directory, as produced by
`scrape_blog.py`.  The output is a new file, `embeds.json`, containing all the
URLs and information about where they were found.
'''


import json
from pprint import pprint
import os
import sys

from bs4 import BeautifulSoup


def find_embeds(html):
    '''
    Parse `html` and return a list of `(kind, url)` tuples.

    `kind` is 'embed' for the `src` of an `<iframe>` and 'link' for the
    `href` of an `<a>`.  All embeds are listed before all links; within each
    group the entries follow the order in which `find_all` yields the tags.

    An iframe whose `src` is missing or empty prints a warning and is
    skipped.  An anchor whose `href` is missing or empty is skipped silently.
    '''
    document = BeautifulSoup(html, 'lxml')
    found = []

    # Pass 1: iframes.  Guard clause handles the "nothing to record" case.
    for frame in document.find_all('iframe'):
        source = frame.get('src')
        if not source:
            print('warning: no src in iframe')
            continue
        found.append(('embed', source))

    # Pass 2: anchors.  Falsy targets (None or '') are dropped.
    targets = (anchor.get('href') for anchor in document.find_all('a'))
    found.extend(('link', target) for target in targets if target)

    return found


def main():
    '''
    Collect embeds from every `posts-*.json` file in the current directory
    and write them to `embeds.json`.

    Files are processed in sorted filename order.  Each file is expected to
    decode to a sequence of posts, where every post has a 'content' entry
    holding HTML that is handed to `find_embeds`.  Every URL found becomes
    one record with these keys:

        post_file   name of the file the post came from
        post_index  position of the post within that file
        kind        'embed' or 'link', as reported by `find_embeds`
        url         the extracted URL

    A summary count is printed before the output file is written.

    Raises whatever `open`, `json.load` or the 'content' lookup raise on
    unreadable or malformed input; nothing is caught here.
    '''
    all_embeds = []

    for filename in sorted(os.listdir('.')):
        if not (filename.startswith('posts-') and filename.endswith('.json')):
            continue

        print('processing %s' % filename)

        # JSON text is UTF-8; decode it explicitly rather than relying on
        # the platform's locale encoding, which can mangle or reject
        # non-ASCII post content.
        with open(filename, encoding='utf-8') as f:
            posts = json.load(f)

        for i, post in enumerate(posts):
            for kind, url in find_embeds(post['content']):
                all_embeds.append({
                    'post_file': filename,
                    'post_index': i,
                    'kind': kind,
                    'url': url,
                })

    print('collected %d embeds, %d links' % (
        sum(1 for e in all_embeds if e['kind'] == 'embed'),
        sum(1 for e in all_embeds if e['kind'] == 'link')))

    # json.dump escapes non-ASCII by default, so the bytes written are the
    # same as before; the encoding is spelled out only for explicitness.
    with open('embeds.json', 'w', encoding='utf-8') as f:
        json.dump(all_embeds, f)


# Run the extraction only when this file is executed as a script, so that
# importing the module (e.g. to reuse find_embeds) has no side effects.
if __name__ == '__main__':
    main()


