#!/usr/bin/env python
"""
Screen scraper that attempts to download album cover art from Amazon.

The music collection is expected to be laid out as
``<base>/<artist>/<album>/``.  Every album directory found two levels below
the base directory is searched for on Amazon's digital-music store and the
cover that is found is saved as ``imgs/<artist>.<album>.500.jpg`` relative to
the current working directory.

This is not very generic: it relies on the exact markup of the search result
pages, so it stops working whenever those pages change.  It was written for
personal use; improvements are welcome.

Usage: get-amazon-art <base directory>
"""

import os
import shutil
import sys

try:  # Python 2 module layout, which this script was written against.
    from urllib import urlencode
    from urllib2 import urlopen
except ImportError:  # Python 3 moved both names.
    from urllib.parse import urlencode
    from urllib.request import urlopen

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    # Reported by main(); keeps the pure helpers importable without it.
    BeautifulSoup = None

# Directory (relative to the current working directory) covers are saved in.
IMAGE_DIR = 'imgs'

# Size tag used in the file name of covers fetched from Amazon.
AMAZON_COVER_SIZE = 500

# Size tags whose presence means "this album already has a cover".
# NOTE: the eMusic lookup (get_url / get_album_img_srcs_emusic) is not wired
# into main(); if it is enabled again, add its 1400 and 600 sizes here.
KNOWN_COVER_SIZES = (AMAZON_COVER_SIZE,)


def walklevel(some_dir, level=1):
    """Like os.walk(), but descend at most *level* levels below *some_dir*.

    With ``level=0`` only *some_dir* itself is yielded.

    Raises:
        ValueError: if *some_dir* is not an existing directory.
    """
    some_dir = some_dir.rstrip(os.path.sep)
    if not os.path.isdir(some_dir):
        raise ValueError("Not a directory: '%s'" % some_dir)
    base_depth = some_dir.count(os.path.sep)
    for root, dirs, files in os.walk(some_dir):
        yield root, dirs, files
        # Emptying ``dirs`` in place tells os.walk() not to recurse further.
        if base_depth + level <= root.count(os.path.sep):
            del dirs[:]


def _child_dirs(path):
    """Return the immediate sub-directories of *path*, sorted by name.

    Paths are joined with '/' (not os.path.join) because
    get_term_from_path() splits on '/'.
    """
    found = []
    for root, dirs, _files in walklevel(path, 0):
        for name in sorted(dirs):
            found.append("%s/%s" % (root, name))
    return found


def get_search_dirs(base='.'):
    """Return every ``<base>/<artist>/<album>`` directory below *base*."""
    ret = []
    for artist_dir in _child_dirs(base):
        ret.extend(_child_dirs(artist_dir))
    return ret


def get_term_from_path(the_dir):
    """Build the search terms from an ``.../<artist>/<album>`` path.

    Trailing slashes are ignored.  Raises IndexError if the path has fewer
    than two components.
    """
    parts = the_dir.rstrip('/').split('/')
    return {
        'artist': parts[-2],
        'album': parts[-1]
    }


def get_url_amazon(term):
    """Return the Amazon digital-music search URL for *term*."""
    # A list of pairs (rather than a dict) keeps the parameter order stable.
    params = [
        ('url', 'search-alias=digital-music'),
        ('x', '0'),
        ('y', '0'),
        ('field-keywords', term),
    ]
    return "http://www.amazon.com/s/ref=nb_sb_noss?%s" % urlencode(params)


def get_url(term):
    """Return the eMusic search URL for *term*."""
    params = [
        ('mode', 'b'),
        ('QT', term),
    ]
    return "http://www.emusic.com/search.html?%s" % urlencode(params)


def _fetch_soup(url):
    """Download *url* and return it parsed by BeautifulSoup."""
    if BeautifulSoup is None:
        raise ImportError("The BeautifulSoup package is not installed")
    response = urlopen(url)
    try:
        markup = response.read()
    finally:
        response.close()
    return BeautifulSoup(markup)


def get_album_img_src_amazon(url):
    """Return the cover image URL found on the Amazon search page *url*.

    Raises:
        LookupError: if the page does not contain the expected album link.
    """
    soup = _fetch_soup(url)
    link = soup.find('a', {'id': 'mp3StoreShovelerShvlLink0'})
    img = link.find('img') if link is not None else None
    if img is None:
        raise LookupError("No album image found at %s" % url)

    # Drop the second-to-last '_'-separated chunk of the image URL.
    # NOTE(review): presumably that chunk is the thumbnail size modifier, so
    # removing it yields the larger image -- confirm against a live page.
    img_parts = img['src'].split('_')
    img_parts.pop(-2)
    return '_'.join(img_parts)


def get_album_img_srcs_emusic(url):
    """Return the cover image URLs found on the eMusic search page *url*.

    Returns a dict with the keys ``'large'`` (600x600) and ``'huge'``
    (1400x1400), both derived from the first result's thumbnail URL.

    Raises:
        LookupError: if the page does not contain the expected result list.
    """
    soup = _fetch_soup(url)

    result_list = soup.find('ul', {'class': 'resultList'})
    first_item = result_list.find('li') if result_list is not None else None
    img = first_item.find('img') if first_item is not None else None
    if img is None:
        raise LookupError("No album image found at %s" % url)

    # The last path component of the thumbnail URL names the image size.
    prefix = img['src'].split('/')[:-1]
    return {
        'large': '/'.join(prefix + ['600x600.jpg']),
        'huge': '/'.join(prefix + ['1400x1400.jpg'])
    }


def get_file_name(info, size):
    """Return the path a cover of *size* for the album *info* is stored at.

    *info* is a dict with the keys ``'artist'`` and ``'album'``.
    """
    return '%s/%s.%s.%d.jpg' % (IMAGE_DIR, info['artist'], info['album'], size)


def save_file(in_stream, size, info):
    """Write everything readable from *in_stream* to the cover file.

    The data goes to a ``.part`` file first and is renamed only once it is
    complete, so an interrupted download never leaves a truncated cover that
    a later run would mistake for a finished one.
    """
    file_name = get_file_name(info, size)
    target_dir = os.path.dirname(file_name)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    partial_name = file_name + '.part'
    try:
        with open(partial_name, 'wb') as output:
            shutil.copyfileobj(in_stream, output)
        os.rename(partial_name, file_name)
    finally:
        # Only still present when writing or renaming failed.
        if os.path.exists(partial_name):
            os.remove(partial_name)


def _existing_cover(search_terms):
    """Return the path of an already downloaded cover, or None."""
    for size in KNOWN_COVER_SIZES:
        candidate = get_file_name(search_terms, size)
        if os.path.exists(candidate):
            return candidate
    return None


def main():
    """The main script."""
    if BeautifulSoup is None:
        sys.exit("The BeautifulSoup package is required but is not installed")

    # Get the path for the directory
    if len(sys.argv) < 2:
        sys.exit("Must enter path of directory to search. Example: %s ./foo"
                 % sys.argv[0])
    base_dir = sys.argv[1]

    if not os.path.isdir(base_dir):
        sys.exit("Could not find path: '%s'" % base_dir)

    # Remove trailing /
    if base_dir.endswith("/"):
        base_dir = base_dir[:-1]

    for the_dir in get_search_dirs(base_dir):
        search_terms = get_term_from_path(the_dir)

        found = _existing_cover(search_terms)
        if found is not None:
            print("Skipping check, found '%s'" % found)
            continue

        url = get_url_amazon(
            "%s %s" % (search_terms['artist'], search_terms['album']))
        try:
            img_src = get_album_img_src_amazon(url)
            img_file = urlopen(img_src)
        except Exception:
            # Deliberately best-effort: whatever goes wrong while scraping
            # one album must not stop the remaining albums.
            print("Skipped, could not find cover art for %s"
                  % search_terms['album'])
            continue

        try:
            save_file(img_file, AMAZON_COVER_SIZE, search_terms)
        finally:
            img_file.close()
        print("Saved large amazon cover for %s" % search_terms['album'])


if __name__ == "__main__":
    main()