dot-files/old-scripts/.local/bin/get-amazon-art

#!/usr/bin/env python
"""
This is a program that attempts to get all the album art from amazon
as is possible.
"""

import os
import sys
import urllib
import urllib2

from BeautifulSoup import BeautifulSoup

def walklevel(some_dir, level=1):
    """Walk ``some_dir`` like ``os.walk`` but limit how deep it descends.

    This is a generator yielding the same ``(root, dirs, files)`` tuples as
    ``os.walk``.  With ``level=0`` only ``some_dir`` itself is reported; each
    additional level allows one more layer of sub-directories.

    Note that, once a directory at the depth limit has been yielded, its
    ``dirs`` list is emptied in place to stop ``os.walk`` from descending.
    Consumers that need the names must read or copy ``dirs`` before asking
    for the next item.

    Args:
        some_dir: Directory to start from; trailing path separators are
            ignored.
        level: Maximum depth below ``some_dir`` to descend into.

    Raises:
        AssertionError: If ``some_dir`` is not an existing directory.  As
            this is a generator, the check runs on the first iteration.
    """
    sep = os.path.sep
    top = some_dir.rstrip(sep)
    if some_dir and not top:
        # The argument consisted only of separators (the filesystem root);
        # stripping must not turn it into the empty string.
        top = sep
    if not os.path.isdir(top):
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError("not a directory: %r" % (some_dir,))

    # The root path is its own separator, so it contributes no depth.
    base_depth = 0 if top == sep else top.count(sep)
    for root, dirs, files in os.walk(top):
        yield root, dirs, files
        depth = 0 if root == top else root.count(sep) - base_depth
        if depth >= level:
            # Emptying the list in place tells os.walk not to recurse.
            del dirs[:]

def get_search_dirs(base = '.'):
    """Collect the directories found exactly two levels below ``base``.

    The immediate sub-directories of ``base`` are listed first, then the
    immediate sub-directories of each of those.  Paths are assembled with a
    literal ``/`` separator.

    Args:
        base: Directory to scan; defaults to the current directory.

    Returns:
        A list of ``<base>/<first level>/<second level>`` path strings.
    """
    second_level = []
    for top, first_names, _ in walklevel(base, 0):
        first_level = ["%s/%s" % (top, name) for name in first_names]
        for parent in first_level:
            for _, child_names, _ in walklevel(parent, 0):
                second_level.extend(
                    "%s/%s" % (parent, child) for child in child_names)
    return second_level

def get_term_from_path(the_dir):
    """Derive artist/album search terms from a ``/``-separated path.

    The last path component is taken as the album and the one before it as
    the artist.

    Args:
        the_dir: Path string containing at least one ``/``.

    Returns:
        A dict with the keys ``'artist'`` and ``'album'``.

    Raises:
        IndexError: If ``the_dir`` contains no ``/`` at all.
    """
    segments = the_dir.split('/')
    artist_name = segments[-2]
    album_name = segments[-1]
    return dict(artist=artist_name, album=album_name)

def get_url_amazon(term):
    """Build the Amazon digital-music search URL for ``term``.

    Args:
        term: Free-text search keywords, e.g. ``"artist album"``.

    Returns:
        The search URL, with ``term`` URL-encoded as the ``field-keywords``
        query parameter.
    """
    # ``urlencode`` lives in ``urllib`` on Python 2 and in ``urllib.parse``
    # on Python 3; resolve it locally so this works on either interpreter.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    # A sequence of pairs (rather than a dict) keeps the order of the query
    # parameters stable regardless of the interpreter's dict ordering.
    params = (
        ('url', 'search-alias=digital-music'),
        ('x', '0'),
        ('y', '0'),
        ('field-keywords', term),
    )
    return "http://www.amazon.com/s/ref=nb_sb_noss?%s" % urlencode(params)

def get_url(term):
    """Build the eMusic search URL for ``term``.

    Args:
        term: Free-text search keywords.

    Returns:
        The search URL, with ``term`` URL-encoded as the ``QT`` query
        parameter.
    """
    # ``urlencode`` lives in ``urllib`` on Python 2 and in ``urllib.parse``
    # on Python 3; resolve it locally so this works on either interpreter.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    # A sequence of pairs (rather than a dict) keeps the order of the query
    # parameters stable regardless of the interpreter's dict ordering.
    params = (
        ('mode', 'b'),
        ('QT', term),
    )
    return "http://www.emusic.com/search.html?%s" % urlencode(params)

def get_album_img_src_amazon(url):
    """Fetch an Amazon search page and return the first result's image URL.

    The page at ``url`` is downloaded and parsed.  The ``src`` of the
    ``<img>`` inside the anchor whose id is ``mp3StoreShovelerShvlLink0`` is
    taken, and its second-to-last ``_``-separated segment is removed
    (presumably the thumbnail-size modifier, so that the result points at
    the larger image -- TODO confirm against the URLs Amazon serves).

    Args:
        url: URL of the search-results page to scrape.

    Returns:
        The rewritten image URL as a string.

    Raises:
        Any error raised by ``urllib2.urlopen`` when the page cannot be
        fetched.  If the expected anchor or image is absent, or the image
        URL contains no ``_``, the lookups below fail (expected to surface
        as AttributeError / TypeError / IndexError -- callers should treat
        any exception as "no cover found").
    """
    response = urllib2.urlopen(url)
    try:
        # NOTE(review): assumes the parser consumes the whole response in
        # its constructor, so closing right afterwards is safe -- confirm.
        soup = BeautifulSoup(response)
    finally:
        # Release the HTTP connection even when parsing fails.
        response.close()

    img_src = soup.find('a', {'id': 'mp3StoreShovelerShvlLink0'}).img['src']
    img_parts = img_src.split('_')
    img_parts.pop(-2)
    return '_'.join(img_parts)

def get_album_img_srcs_emusic(url):
    """Fetch an eMusic search page and derive cover URLs for the first hit.

    The page at ``url`` is downloaded and parsed.  The first ``<img>`` of the
    first ``<li>`` inside the ``<ul class="resultList">`` element is located
    and the last path component of its ``src`` is replaced to build two
    candidate URLs.  Whether those files actually exist on the server is not
    checked here.

    Args:
        url: URL of the search-results page to scrape.

    Returns:
        A dict with the keys ``'large'`` (``.../600x600.jpg``) and
        ``'huge'`` (``.../1400x1400.jpg``).

    Raises:
        Any error raised by ``urllib2.urlopen`` when the page cannot be
        fetched.  If the result list, its first item or the image is absent,
        the lookups below fail (expected to surface as AttributeError /
        IndexError / TypeError -- callers should treat any exception as
        "no cover found").
    """
    response = urllib2.urlopen(url)
    try:
        # NOTE(review): assumes the parser consumes the whole response in
        # its constructor, so closing right afterwards is safe -- confirm.
        soup = BeautifulSoup(response)
    finally:
        # Release the HTTP connection even when parsing fails.
        response.close()

    result_list = soup.find('ul', {'class': 'resultList'})
    first_item = result_list.findAll('li')[0]
    img_src = first_item.find('img')['src']

    # Swap only the file name; the directory part of the URL is kept.
    img_parts = img_src.split('/')
    img_parts[-1] = '600x600.jpg'
    large_img = '/'.join(img_parts)
    img_parts[-1] = '1400x1400.jpg'
    huge_img = '/'.join(img_parts)

    return {
        'large': large_img,
        'huge': huge_img,
    }


def get_file_name(info, size):
    """Return the relative path of the cover-art file for an album.

    Args:
        info: Mapping providing the ``'artist'`` and ``'album'`` strings.
        size: Numeric size tag, formatted as an integer in the file name.

    Returns:
        A path of the form ``imgs/<artist>.<album>.<size>.jpg``.
    """
    name_fields = (info['artist'], info['album'], size)
    return 'imgs/%s.%s.%d.jpg' % name_fields

def save_file(in_stream, size, info):
    """Write the contents of ``in_stream`` to the cover-art file for ``info``.

    The destination path comes from ``get_file_name(info, size)``; its parent
    directory is created when it does not exist yet.

    Args:
        in_stream: Readable binary file-like object; ``read()`` is called
            once and the stream is not closed here.
        size: Size tag passed on to ``get_file_name``.
        info: Mapping passed on to ``get_file_name``.
    """
    file_name = get_file_name(info, size)

    # Read first: if the download fails midway, no empty file is left behind
    # that a later existence check would mistake for a saved cover.
    data = in_stream.read()

    target_dir = os.path.dirname(file_name)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # The context manager closes the file even if the write fails.
    with open(file_name, 'wb') as output:
        output.write(data)

def main():
    """Download missing Amazon cover art for each album directory.

    The directory given as the first command-line argument is scanned with
    ``get_search_dirs``; every returned path is turned into artist/album
    search terms.  Albums whose cover file (see ``get_file_name``) already
    exists are skipped; for the others the Amazon search page is scraped and
    the image saved through ``save_file``.  Albums for which no cover can be
    fetched are reported and skipped.

    Exits with status 1 (via ``sys.exit``) when the argument is missing or
    the path does not exist.
    """

    # Get the path for the directory
    try:
        base_dir = sys.argv[1]
    except IndexError:
        print("Must enter path of directory to search. Example: %s ./foo" %
              sys.argv[0])
        # A usage error is an ordinary failure: exit cleanly instead of
        # aborting the process with SIGABRT.
        sys.exit(1)

    if not os.path.exists(base_dir):
        print("Could not find path: '%s'" % base_dir)
        sys.exit(1)

    # Remove trailing /
    if base_dir[-1] == "/":
        base_dir = base_dir[:-1]

    # Sizes whose presence on disk means the album is already done.  Only
    # the 500 px Amazon cover is produced at the moment; 1400 and 600 belong
    # to the disabled eMusic lookup.
    existing_sizes = (500, )

    for the_dir in get_search_dirs(base_dir):

        search_terms = get_term_from_path(the_dir)

        found_file = False
        for tmp_size in existing_sizes:
            tmp_file_name = get_file_name(search_terms, tmp_size)
            if os.path.exists(tmp_file_name):
                print("Skipping check, found '%s'" % tmp_file_name)
                found_file = True
                break
        if found_file:
            continue

        # NOTE: an eMusic lookup (get_url + get_album_img_srcs_emusic, saving
        # the 1400 or 600 px image when the response was an image) used to
        # run here before the Amazon fallback; it is currently disabled.

        url = get_url_amazon(
            "%s %s" % (search_terms['artist'], search_terms['album']))
        try:
            img_src = get_album_img_src_amazon(url)
            img_file = urllib2.urlopen(img_src)
        except Exception:
            # Best effort: any scraping or network failure just skips this
            # album.  ``Exception`` (not a bare except) lets Ctrl-C through.
            print("Skipped, could not find cover art for %s" %
                  search_terms['album'])
            continue

        try:
            save_file(img_file, 500, search_terms)
        finally:
            # Release the HTTP connection whether or not saving succeeded.
            img_file.close()
        print("Saved large amazon cover for %s" % search_terms['album'])

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Screen scraper script that attempts to download album art. This is not very generic; it could be much better. I wrote it for myself; maybe someone else could improve it. (12 years ago)
			`#!/usr/bin/env python`
			`"""`
			`This is a program that attempts to get all the album art from amazon`
			`as is possible.`
			`"""`

			`import os`
			`import sys`
			`import urllib`
			`import urllib2`

			`from BeautifulSoup import BeautifulSoup`

			`def walklevel(some_dir, level=1):`
			`some_dir = some_dir.rstrip(os.path.sep)`
			`assert os.path.isdir(some_dir)`
			`num_sep = some_dir.count(os.path.sep)`
			`for root, dirs, files in os.walk(some_dir):`
			`yield root, dirs, files`
			`num_sep_this = root.count(os.path.sep)`
			`if num_sep + level <= num_sep_this:`
			`del dirs[:]`

			`def get_search_dirs(base = '.'):`
			`ret = []`
			`for root, dirs, files in walklevel(base, 0):`
			`for the_dir in dirs:`
			`new_path = "%s/%s" %(root, the_dir)`
			`for root2, dirs2, files2 in walklevel(new_path, 0):`
			`for found_dir in dirs2:`
			`ret.append("%s/%s" % (new_path, found_dir))`
			`return ret`

			`def get_term_from_path(the_dir):`
			`"""Gets a search term from the directory"""`
			`parts = the_dir.split('/')`
			`return {`
			`'artist': parts[-2],`
			`'album': parts[-1]`
			`}`

			`def get_url_amazon(term):`
			`"""Returns the amazon search url get_url"""`
			`params = {`
			`'url': 'search-alias=digital-music',`
			`'x': '0',`
			`'y': '0',`
			`'field-keywords': term`
			`}`
			`return "http://www.amazon.com/s/ref=nb_sb_noss?%s" % urllib.urlencode(params)`

			`def get_url(term):`
			`"""Returns the amazon search url get_url"""`
			`params = {`
			`'mode': 'b',`
			`'QT': term`
			`}`
			`return "http://www.emusic.com/search.html?%s" % urllib.urlencode(params)`

			`def get_album_img_src_amazon(url):`
			`"""parses the url and find the link for the album"""`
			`html = urllib2.urlopen(url)`
			`soup = BeautifulSoup(html)`
			`imgSrc = soup.find('a', {'id': 'mp3StoreShovelerShvlLink0'}).img['src']`
			`imgParts = imgSrc.split('_');`
			`imgParts.pop(-2)`
			`return '_'.join(imgParts)`

			`def get_album_img_srcs_emusic(url):`
			`"""parses the url and find the link for the album"""`
			`html = urllib2.urlopen(url)`
			`soup = BeautifulSoup(html)`

			`ul = soup.find('ul', {'class': 'resultList'})`
			`li = ul.findAll('li')[0]`
			`img = li.find('img')`
			`img_src = img['src']`

			`img_parts = img_src.split('/')`
			`img_parts[-1] = '600x600.jpg'`
			`large_img = '/'.join(img_parts)`
			`img_parts[-1] = '1400x1400.jpg'`
			`huge_img = '/'.join(img_parts)`

			`return {`
			`'large': large_img,`
			`'huge': huge_img`
			`}`


			`def get_file_name(info, size):`
			`"""docstring for get_file_name"""`
			`return 'imgs/%s.%s.%d.jpg' % (info['artist'], info['album'], size)`

			`def save_file(in_stream, size, info):`
			`"""docstring for save_file"""`
			`file_name = get_file_name(info, size)`
			`output = open(file_name, 'wb')`
			`output.write(in_stream.read())`
			`output.close()`

			`def main():`
			`"""The main script."""`

			`# Get the path for the directory`
			`try:`
			`base_dir = sys.argv[1]`
			`except IndexError:`
			`print "Must enter path of directory to search. Example: %s ./foo" % \`
			`sys.argv[0]`
			`os.abort()`

			`if os.path.exists(base_dir) == False:`
			`print "Could not find path: '%s'" % base_dir`
			`os.abort()`

			`# Remove trailing /`
			`if base_dir[-1] == "/":`
			`base_dir = base_dir[:-1]`

			`search_dirs = get_search_dirs(base_dir)`

			`for the_dir in search_dirs:`

			`search_terms = get_term_from_path(the_dir)`

			`found_file = False`
			`#for tmp_size in (1400, 600, 500):`
			`for tmp_size in (500, ):`
			`tmp_file_name = get_file_name(search_terms, tmp_size)`
			`if os.path.exists(tmp_file_name):`
			`print "Skipping check, found '%s'" % tmp_file_name`
			`found_file = True`
			`break`
			`if found_file:`
			`continue`

			`#img_src = None`
			`#url = get_url(search_terms['album'])`
			`#try:`
			`# img_src = get_album_img_srcs_emusic(url)`
			`#except:`
			`# img_src = None`

			`#if img_src != None:`
			`# img_file = urllib2.urlopen(img_src['huge'])`
			`# if img_file.info().getmaintype() == 'image':`
			`# save_file(img_file, 1400, search_terms)`
			`# print "Saved huge emusic cover for %s" % search_terms['album']`
			`# continue`

			`# img_file = urllib2.urlopen(img_src['large'])`
			`# if img_file.info().getmaintype() == 'image':`
			`# save_file(img_file, 600, search_terms)`
			`# print "Saved large emusic cover for %s" % search_terms['album']`
			`# continue`

			`# If emusic doesn't have it, try amazon`
			`url = get_url_amazon("%s %s" % (search_terms['artist'], search_terms['album']))`
			`try:`
			`img_src = get_album_img_src_amazon(url)`
			`img_file = urllib2.urlopen(img_src)`
			`except:`
			`print "Skipped, could not find cover art for %s" % search_terms['album']`
			`continue`
			`save_file(img_file, 500, search_terms)`
			`print "Saved large amazon cover for %s" % search_terms['album']`

			`if __name__ == "__main__":`
			`main()`