Screen scraper script that attempts to download album art
This is not very generic. It could be much better. I wrote it for myself; maybe someone else could improve it.
parent
36cc876558
commit
000f61c4c7
@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
This is a program that attempts to download as much album art
from Amazon as possible.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
def walklevel(some_dir, level=1):
|
||||
some_dir = some_dir.rstrip(os.path.sep)
|
||||
assert os.path.isdir(some_dir)
|
||||
num_sep = some_dir.count(os.path.sep)
|
||||
for root, dirs, files in os.walk(some_dir):
|
||||
yield root, dirs, files
|
||||
num_sep_this = root.count(os.path.sep)
|
||||
if num_sep + level <= num_sep_this:
|
||||
del dirs[:]
|
||||
|
||||
def get_search_dirs(base = '.'):
    """Return the list of album directories exactly two levels below
    ``base`` (layout assumed: base/artist/album), as '/'-joined paths.
    """
    found = []
    for root, artist_dirs, _files in walklevel(base, 0):
        for artist_dir in artist_dirs:
            artist_path = "%s/%s" % (root, artist_dir)
            # Walk one level inside each artist directory to pick up albums.
            for _root2, album_dirs, _files2 in walklevel(artist_path, 0):
                for album_dir in album_dirs:
                    found.append("%s/%s" % (artist_path, album_dir))
    return found
|
||||
|
||||
def get_term_from_path(the_dir):
    """Derive search terms from a path: the last '/'-separated component
    is the album name, the second-to-last is the artist name."""
    components = the_dir.split('/')
    return dict(artist=components[-2], album=components[-1])
|
||||
|
||||
def get_url_amazon(term):
    """Return the Amazon digital-music search URL for ``term``.

    The fixed ``url``/``x``/``y`` parameters mimic the site's own search
    form submission. Uses a local import so the function works under both
    Python 2 (urllib.urlencode) and Python 3 (urllib.parse.urlencode).
    """
    try:
        from urllib import urlencode  # Python 2
    except ImportError:
        from urllib.parse import urlencode  # Python 3
    params = {
        'url': 'search-alias=digital-music',
        'x': '0',
        'y': '0',
        'field-keywords': term,
    }
    return "http://www.amazon.com/s/ref=nb_sb_noss?%s" % urlencode(params)
|
||||
|
||||
def get_url(term):
    """Return the emusic search URL for ``term``.

    (The original docstring wrongly said "amazon"; this builds an
    emusic.com search query.) Uses a local import so the function works
    under both Python 2 (urllib.urlencode) and Python 3
    (urllib.parse.urlencode).
    """
    try:
        from urllib import urlencode  # Python 2
    except ImportError:
        from urllib.parse import urlencode  # Python 3
    params = {
        'mode': 'b',
        'QT': term,
    }
    return "http://www.emusic.com/search.html?%s" % urlencode(params)
|
||||
|
||||
def get_album_img_src_amazon(url):
    """Fetch an Amazon search result page and return the full-size cover
    image URL of the first result.

    Raises if the page has no shoveler link (caller treats that as
    "no cover found").
    """
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    # The first shoveler result wraps the thumbnail image; its src embeds
    # a size modifier as the second-to-last '_'-separated piece. Dropping
    # that piece yields the full-size image URL.
    thumb_src = soup.find('a', {'id': 'mp3StoreShovelerShvlLink0'}).img['src']
    pieces = thumb_src.split('_')
    del pieces[-2]
    return '_'.join(pieces)
|
||||
|
||||
def get_album_img_srcs_emusic(url):
    """Fetch an emusic search result page and return cover-art URLs for
    the first result as a dict with keys 'large' (600x600) and 'huge'
    (1400x1400).

    Raises if the result list is missing or empty (caller treats that as
    "no cover found").
    """
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)

    # First <li> of the result list holds the thumbnail for the top hit.
    result_list = soup.find('ul', {'class': 'resultList'})
    first_hit = result_list.findAll('li')[0]
    thumb_src = first_hit.find('img')['src']

    # Larger sizes live at the same path with a different final filename.
    base_parts = thumb_src.split('/')[:-1]
    return {
        'large': '/'.join(base_parts + ['600x600.jpg']),
        'huge': '/'.join(base_parts + ['1400x1400.jpg']),
    }
|
||||
|
||||
|
||||
def get_file_name(info, size):
    """Return the local path 'imgs/<artist>.<album>.<size>.jpg' for the
    cover image described by ``info`` (dict with 'artist' and 'album')."""
    return "imgs/{0}.{1}.{2:d}.jpg".format(info['artist'], info['album'], size)
|
||||
|
||||
def save_file(in_stream, size, info):
    """Read all of ``in_stream`` and write it to the cover-image file
    named by ``info``/``size`` (see get_file_name). The 'imgs' directory
    must already exist."""
    # `with` guarantees the handle is closed even if read/write fails.
    with open(get_file_name(info, size), 'wb') as output:
        output.write(in_stream.read())
|
||||
|
||||
def main():
|
||||
"""The main script."""
|
||||
|
||||
# Get the path for the directory
|
||||
try:
|
||||
base_dir = sys.argv[1]
|
||||
except IndexError:
|
||||
print "Must enter path of directory to search. Example: %s ./foo" % \
|
||||
sys.argv[0]
|
||||
os.abort()
|
||||
|
||||
if os.path.exists(base_dir) == False:
|
||||
print "Could not find path: '%s'" % base_dir
|
||||
os.abort()
|
||||
|
||||
# Remove trailing /
|
||||
if base_dir[-1] == "/":
|
||||
base_dir = base_dir[:-1]
|
||||
|
||||
search_dirs = get_search_dirs(base_dir)
|
||||
|
||||
for the_dir in search_dirs:
|
||||
|
||||
search_terms = get_term_from_path(the_dir)
|
||||
|
||||
found_file = False
|
||||
#for tmp_size in (1400, 600, 500):
|
||||
for tmp_size in (500, ):
|
||||
tmp_file_name = get_file_name(search_terms, tmp_size)
|
||||
if os.path.exists(tmp_file_name):
|
||||
print "Skipping check, found '%s'" % tmp_file_name
|
||||
found_file = True
|
||||
break
|
||||
if found_file:
|
||||
continue
|
||||
|
||||
#img_src = None
|
||||
#url = get_url(search_terms['album'])
|
||||
#try:
|
||||
# img_src = get_album_img_srcs_emusic(url)
|
||||
#except:
|
||||
# img_src = None
|
||||
|
||||
#if img_src != None:
|
||||
# img_file = urllib2.urlopen(img_src['huge'])
|
||||
# if img_file.info().getmaintype() == 'image':
|
||||
# save_file(img_file, 1400, search_terms)
|
||||
# print "Saved huge emusic cover for %s" % search_terms['album']
|
||||
# continue
|
||||
|
||||
# img_file = urllib2.urlopen(img_src['large'])
|
||||
# if img_file.info().getmaintype() == 'image':
|
||||
# save_file(img_file, 600, search_terms)
|
||||
# print "Saved large emusic cover for %s" % search_terms['album']
|
||||
# continue
|
||||
|
||||
# If emusic doesn't have it, try amazon
|
||||
url = get_url_amazon("%s %s" % (search_terms['artist'], search_terms['album']))
|
||||
try:
|
||||
img_src = get_album_img_src_amazon(url)
|
||||
img_file = urllib2.urlopen(img_src)
|
||||
except:
|
||||
print "Skipped, could not find cover art for %s" % search_terms['album']
|
||||
continue
|
||||
save_file(img_file, 500, search_terms)
|
||||
print "Saved large amazon cover for %s" % search_terms['album']
|
||||
|
||||
# Entry-point guard: run main() only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
|
Loading…
Reference in New Issue