Screen scraper script that attempts to download album art
This is not very generic and could be much better. I wrote it for myself; maybe someone else could improve it.
main
parent
36cc876558
commit
000f61c4c7
@ -0,0 +1,171 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
This program attempts to download as much album art from Amazon as it can.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
import shutil
import sys
import urllib
import urllib2

from BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
|
def walklevel(some_dir, level=1):
    """Walk ``some_dir`` like ``os.walk`` but stop descending after ``level`` levels.

    Yields the same ``(root, dirs, files)`` tuples as ``os.walk``.  With
    ``level=0`` only ``some_dir`` itself is yielded; with ``level=1`` its
    immediate subdirectories are yielded as well, and so on.

    The ``dirs`` list of a directory at the depth limit is emptied *after* it
    has been yielded (that is how ``os.walk`` is told not to descend), so
    callers must read or copy ``dirs`` before advancing the generator.

    Raises AssertionError (on the first iteration) if ``some_dir`` is not a
    directory.
    """
    sep = os.path.sep
    # rstrip() would reduce a bare root such as "/" to "", which is not a
    # directory; fall back to a single separator so the root can be walked.
    top = some_dir.rstrip(sep) or some_dir[:1]
    if not os.path.isdir(top):
        # Raised explicitly so the check survives ``python -O``; the exception
        # type is kept for callers that relied on the original ``assert``.
        raise AssertionError("not a directory: %r" % (some_dir,))

    # Depth is measured on separator-stripped paths so that the root ("/",
    # which strips to "") counts as depth 0 like any other starting point.
    base_depth = top.rstrip(sep).count(sep)
    for root, dirs, files in os.walk(top):
        yield root, dirs, files
        depth = root.rstrip(sep).count(sep) - base_depth
        if depth >= level:
            # In-place clear: os.walk only honours mutations of this list.
            del dirs[:]
|
||||||
|
|
||||||
|
def get_search_dirs(base='.'):
    """Return the paths of all directories exactly two levels below ``base``.

    Each immediate subdirectory of ``base`` is scanned for its own immediate
    subdirectories, and those are returned as ``<base>/<first>/<second>``
    strings joined with ``/``.  (The rest of the script treats the two
    components as artist and album.)
    """
    second_level = []
    for top, first_names, _ in walklevel(base, 0):
        # Build the first-level paths up front; walklevel empties the list it
        # yielded as soon as the generator is advanced.
        first_paths = ["%s/%s" % (top, name) for name in first_names]
        for first_path in first_paths:
            for _, second_names, _ in walklevel(first_path, 0):
                second_level.extend(
                    "%s/%s" % (first_path, name) for name in second_names)
    return second_level
|
||||||
|
|
||||||
|
def get_term_from_path(the_dir):
    """Derive the search terms from an ``.../<artist>/<album>`` directory path.

    The last two ``/``-separated components of ``the_dir`` are returned as
    ``{'artist': ..., 'album': ...}``.  Trailing slashes are ignored, so
    ``"music/Artist/Album/"`` gives the same result as ``"music/Artist/Album"``.

    Raises IndexError if the path has fewer than two components.
    """
    # Without the rstrip a trailing "/" would yield an empty album name and
    # shift the real album into the artist slot.
    parts = the_dir.rstrip('/').split('/')
    return {
        'artist': parts[-2],
        'album': parts[-1],
    }
|
||||||
|
|
||||||
|
def get_url_amazon(term):
    """Return the Amazon digital-music search URL for ``term``.

    ``term`` is the free-text search string (the script passes
    ``"<artist> <album>"``); it is URL-encoded into the ``field-keywords``
    query parameter.
    """
    try:  # Python 3 moved urlencode into urllib.parse
        from urllib.parse import urlencode
    except ImportError:  # Python 2
        from urllib import urlencode

    # A sequence of pairs (rather than a dict) keeps the parameter order, and
    # therefore the generated URL, identical from run to run.
    params = (
        ('url', 'search-alias=digital-music'),
        ('x', '0'),
        ('y', '0'),
        ('field-keywords', term),
    )
    return "http://www.amazon.com/s/ref=nb_sb_noss?%s" % urlencode(params)
|
||||||
|
|
||||||
|
def get_url(term):
    """Return the emusic search URL for ``term``.

    ``term`` is URL-encoded into the ``QT`` query parameter; ``mode`` is
    always sent as ``b``.
    """
    try:  # Python 3 moved urlencode into urllib.parse
        from urllib.parse import urlencode
    except ImportError:  # Python 2
        from urllib import urlencode

    # A sequence of pairs (rather than a dict) keeps the parameter order, and
    # therefore the generated URL, identical from run to run.
    params = (
        ('mode', 'b'),
        ('QT', term),
    )
    return "http://www.emusic.com/search.html?%s" % urlencode(params)
|
||||||
|
|
||||||
|
def get_album_img_src_amazon(url):
    """Fetch an Amazon search page and return the cover image URL of its first hit.

    The page at ``url`` is parsed, the ``<a id="mp3StoreShovelerShvlLink0">``
    element is located, and the ``src`` of the image inside it is taken.  The
    second-to-last ``_``-separated piece of that URL is then removed
    (presumably the thumbnail size marker, so that the full-size image is
    addressed -- verify against a live page).

    Raises whatever ``urllib2.urlopen`` raises on network errors, and
    AttributeError when the expected link is not present in the page; other
    lookup errors are possible if the markup or image URL has an unexpected
    shape.
    """
    response = urllib2.urlopen(url)
    try:
        # The document is read and parsed while the soup is constructed, so
        # the connection can be released straight afterwards.
        soup = BeautifulSoup(response)
    finally:
        response.close()

    link = soup.find('a', {'id': 'mp3StoreShovelerShvlLink0'})
    img_src = link.img['src']

    img_parts = img_src.split('_')
    img_parts.pop(-2)
    return '_'.join(img_parts)
|
||||||
|
|
||||||
|
def get_album_img_srcs_emusic(url):
    """Fetch an emusic search page and return cover image URLs for its first hit.

    The first ``<li>`` inside ``<ul class="resultList">`` is located and the
    ``src`` of its first image is used as a template: the last path segment is
    replaced to build two candidate URLs, returned as
    ``{'large': '.../600x600.jpg', 'huge': '.../1400x1400.jpg'}``.  Whether
    those images actually exist is not checked here.

    Raises whatever ``urllib2.urlopen`` raises on network errors, and
    AttributeError when the result list is not present in the page; other
    lookup errors are possible if the markup has an unexpected shape.
    """
    response = urllib2.urlopen(url)
    try:
        # The document is read and parsed while the soup is constructed, so
        # the connection can be released straight afterwards.
        soup = BeautifulSoup(response)
    finally:
        response.close()

    result_list = soup.find('ul', {'class': 'resultList'})
    first_item = result_list.findAll('li')[0]
    img_src = first_item.find('img')['src']

    # Everything except the file name is shared by both sizes.
    prefix = img_src.split('/')[:-1]
    return {
        'large': '/'.join(prefix + ['600x600.jpg']),
        'huge': '/'.join(prefix + ['1400x1400.jpg']),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_name(info, size):
    """Return the relative path under ``imgs/`` where a cover image is stored.

    ``info`` must provide the ``'artist'`` and ``'album'`` keys and ``size``
    is the integer image size; the result has the form
    ``imgs/<artist>.<album>.<size>.jpg``.
    """
    artist = info['artist']
    album = info['album']
    return 'imgs/%s.%s.%d.jpg' % (artist, album, size)
|
||||||
|
|
||||||
|
def save_file(in_stream, size, info):
    """Write the contents of ``in_stream`` to the cover file for ``info`` and ``size``.

    The destination is the path returned by ``get_file_name(info, size)``; its
    directory is created first if it does not exist yet.  ``in_stream`` is any
    object with a ``read()`` method (e.g. an open URL response); it is read to
    the end but not closed -- that stays the caller's responsibility.
    """
    file_name = get_file_name(info, size)

    # Without the directory, open() below fails and the download is lost.
    target_dir = os.path.dirname(file_name)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # ``with`` closes the file even if the copy fails part-way; copyfileobj
    # streams in chunks instead of holding the whole image in memory.
    with open(file_name, 'wb') as output:
        shutil.copyfileobj(in_stream, output)
|
||||||
|
|
||||||
|
def _find_existing_cover(search_terms, sizes):
    """Return the file name of an already-saved cover for the album, or None."""
    for size in sizes:
        file_name = get_file_name(search_terms, size)
        if os.path.exists(file_name):
            return file_name
    return None


def main():
    """Download a cover image for every ``<artist>/<album>`` directory.

    The directory to scan is taken from the first command-line argument.
    Albums whose cover file already exists are skipped; for the others the
    Amazon search page is scraped and the image saved via ``save_file``.
    Exits with status 1 and a message on stderr when the argument is missing
    or does not name an existing directory.
    """
    # Size used for covers fetched from Amazon, and the sizes whose presence
    # means "already downloaded".
    amazon_size = 500
    known_sizes = (amazon_size,)

    # sys.exit(message) reports on stderr and exits with status 1, instead of
    # the SIGABRT / core dump that os.abort() produces.
    if len(sys.argv) < 2:
        sys.exit("Must enter path of directory to search. Example: %s ./foo"
                 % sys.argv[0])
    base_dir = sys.argv[1]

    if not os.path.exists(base_dir):
        sys.exit("Could not find path: '%s'" % base_dir)
    if not os.path.isdir(base_dir):
        sys.exit("Not a directory: '%s'" % base_dir)

    # Remove trailing "/" (but leave a bare "/" intact).
    base_dir = base_dir.rstrip("/") or base_dir

    for the_dir in get_search_dirs(base_dir):
        search_terms = get_term_from_path(the_dir)
        album = search_terms['album']

        # print(<single expression>) gives the same output whether print is a
        # statement or a function.
        existing = _find_existing_cover(search_terms, known_sizes)
        if existing is not None:
            print("Skipping check, found '%s'" % existing)
            continue

        # NOTE: an emusic lookup (get_url / get_album_img_srcs_emusic, trying
        # the 1400px and then the 600px image) used to run before this point.
        # It is disabled, so Amazon is currently the only source.
        url = get_url_amazon("%s %s" % (search_terms['artist'], album))
        try:
            img_src = get_album_img_src_amazon(url)
            img_file = urllib2.urlopen(img_src)
        except Exception:
            # Best effort: any scraping/network failure just skips the album.
            # ``Exception`` (not a bare except) lets Ctrl-C stop the run.
            print("Skipped, could not find cover art for %s" % album)
            continue

        try:
            save_file(img_file, amazon_size, search_terms)
        finally:
            img_file.close()
        print("Saved large amazon cover for %s" % album)


if __name__ == "__main__":
    main()
|
||||||
|
|
Loading…
Reference in New Issue