'''
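Read embed/link URLs from `embeds.json` and print the ones that appear to be
real YouTube, Bandcamp, SoundCloud, or similar links (see `HOST_WHITELIST`).
URLs whose host is on neither host list are also printed, with a warning on
stderr.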
'''
import json
from urllib.parse import urlparse
import sys

# Hosts whose links are rejected.  `check_host` also matches subdomains of
# these entries and adds each matched subdomain to this set at runtime.
# Entries are kept in alphabetical order.
HOST_BLACKLIST = {
    '1.bp.blogspot.com',
    '4cdn.org',
    'albdifferent.com',
    'amazon.com',
    'bsckids.com',
    'derpibooru.org',
    'deviantart.com',
    'discord.gg',
    'equestriadaily.com',
    'fifteen.ai',
    'fimfiction.net',
    'harmonycon.org',
    'i.ibb.co',
    'instagram.com',
    'legendsofequestria.com',
    'mlp.fandom.com',
    'mlpfictions.com',
    'mylittlewiki.org',
    'pastebin.com',
    'ponemusic.net',
    'pony.town',
    'ponyvilleciderfest.com',
    'steampowered.com',
    'teamtrees.org',
    'tumblr.com',
    'twitch.tv',
    'twitlonger.com',
    'twitter.com',
    'vod.dlife.disney.co.jp',
    'wikipedia.org',
    'wordpress.com',
    'ytimg.com',
}

# Hosts whose links are accepted.  `check_host` also matches subdomains of
# these entries and adds each matched subdomain to this set at runtime.
# Entries are kept in alphabetical order.
HOST_WHITELIST = {
    'aminoapps.com',
    'bandcamp.com',
    'docs.google.com',
    'drive.google.com',
    'fancynoise.net',
    'feed.podbean.com',
    'fillydelphiaradio.net',
    'fillyradio.com',
    'forestrainmedia.com',
    'horsemusicherald.com',
    'itch.io',
    'marcuswarnermusic.com',
    'mediafire.com',
    'mixcloud.com',
    'mumbleetc.com',
    'newgrounds.com',
    'noise.horse',
    'onedrive.live.com',
    'pony.fm',
    'ponyfamily.000webhostapp.com',
    'ponyvillefm.com',
    'qrates.com',
    'rainwatertornado.cloud',
    'soundcloud.com',
    'spotify.com',
    'vimeo.com',
    'youtu.be',
    'youtube.com',
}

def check_host(h):
    '''
    Classify hostname `h` against `HOST_WHITELIST` and `HOST_BLACKLIST`.

    Returns True if `h`, or any dot-separated suffix of it, is whitelisted;
    False if it is blacklisted; and None if it matches neither list or if `h`
    is empty/None.  Suffixes are tried from shortest to longest and the first
    match wins.

    Side effect: when `h` matches only through a suffix, `h` itself is added
    to the matching set so later lookups of the same host hit the fast path.
    '''
    if not h:
        # `urlparse(...).hostname` is None for URLs without a network
        # location; there is nothing to classify in that case.
        return None

    # Fast path: exact match (also hits hosts cached by earlier calls).
    if h in HOST_WHITELIST:
        return True
    if h in HOST_BLACKLIST:
        return False

    # Normalise before suffix matching: host names are case-insensitive and
    # a fully-qualified name may carry a trailing dot.
    parts = h.strip('.').lower().split('.')
    # Go up to and including the full length so the normalised host itself
    # ('youtube.com.' -> 'youtube.com') is tested, not only its parents.
    for i in range(1, len(parts) + 1):
        suffix = '.'.join(parts[-i:])
        if suffix in HOST_WHITELIST:
            HOST_WHITELIST.add(h)
            return True
        if suffix in HOST_BLACKLIST:
            HOST_BLACKLIST.add(h)
            return False
    return None

def accept_url(u):
    '''
    Decide whether URL `u` should be kept.

    Returns True when the URL's host is whitelisted and False when it is
    blacklisted, as reported by `check_host`.  A URL whose host is on neither
    list -- including a URL that has no host or that `urlparse` cannot parse
    -- is reported on stderr and accepted.
    '''
    try:
        host = urlparse(u).hostname
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets); treat those as unknown instead of aborting the run.
        host = None

    # Only consult the host lists when there is a host to look up.
    ok = check_host(host) if host else None
    if ok is None:
        print('unknown url: %s' % (u,), file=sys.stderr)
        return True
    return ok

def main():
    '''
    Print each accepted URL from `embeds.json` once, in file order.

    The file is loaded as JSON and iterated; every element is indexed with
    the key 'url'.  URLs rejected by `accept_url` are skipped, and a URL that
    has already been printed is not printed again.
    '''
    # Decode as UTF-8 explicitly rather than with the platform's locale
    # encoding, which can mis-decode non-ASCII characters in the JSON.
    with open('embeds.json', encoding='utf-8') as f:
        j = json.load(f)

    seen = set()

    for link in j:
        url = link['url']
        # Check for repeats first so that an unknown URL appearing several
        # times is reported on stderr only once.
        if url in seen:
            continue
        if not accept_url(url):
            continue
        seen.add(url)
        print(url)

# Run the filter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

