Mirror of https://github.com/clinton-hall/nzbToMedia.git
Synced 2025-08-14 02:26:53 -07:00

Move common libs to libs/common

This commit is contained in:
  parent 8dbb1a2451
  commit 1f4bd41bcc

1612 changed files with 962 additions and 10 deletions
904 libs/common/beetsplug/lyrics.py (Normal file)
@@ -0,0 +1,904 @@
# -*- coding: utf-8 -*-
# This file is part of beets.
# Copyright 2016, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

"""Fetches, embeds, and displays lyrics.
"""

from __future__ import absolute_import, division, print_function

import difflib
import errno
import itertools
import json
import struct
import os.path
import re
import requests
import unicodedata
from unidecode import unidecode
import warnings
import six
from six.moves import urllib

try:
    from bs4 import SoupStrainer, BeautifulSoup
    HAS_BEAUTIFUL_SOUP = True
except ImportError:
    HAS_BEAUTIFUL_SOUP = False

try:
    import langdetect
    HAS_LANGDETECT = True
except ImportError:
    HAS_LANGDETECT = False

try:
    # PY3: HTMLParseError was removed in 3.5 as strict mode
    # was deprecated in 3.3.
    # https://docs.python.org/3.3/library/html.parser.html
    from six.moves.html_parser import HTMLParseError
except ImportError:
    class HTMLParseError(Exception):
        pass

from beets import plugins
from beets import ui
import beets

DIV_RE = re.compile(r'<(/?)div>?', re.I)
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
BREAK_RE = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I)
URL_CHARACTERS = {
    u'\u2018': u"'",
    u'\u2019': u"'",
    u'\u201c': u'"',
    u'\u201d': u'"',
    u'\u2010': u'-',
    u'\u2011': u'-',
    u'\u2012': u'-',
    u'\u2013': u'-',
    u'\u2014': u'-',
    u'\u2015': u'-',
    u'\u2016': u'-',
    u'\u2026': u'...',
}
USER_AGENT = 'beets/{}'.format(beets.__version__)

# The content for the base index.rst generated in ReST mode.
REST_INDEX_TEMPLATE = u'''Lyrics
======

* :ref:`Song index <genindex>`
* :ref:`search`

Artist index:

.. toctree::
   :maxdepth: 1
   :glob:

   artists/*
'''

# The content for the base conf.py generated.
REST_CONF_TEMPLATE = u'''# -*- coding: utf-8 -*-
master_doc = 'index'
project = u'Lyrics'
copyright = u'none'
author = u'Various Authors'
latex_documents = [
    (master_doc, 'Lyrics.tex', project,
     author, 'manual'),
]
epub_title = project
epub_author = author
epub_publisher = author
epub_copyright = copyright
epub_exclude_files = ['search.html']
epub_tocdepth = 1
epub_tocdup = False
'''


# Utilities.

def unichar(i):
    try:
        return six.unichr(i)
    except ValueError:
        return struct.pack('i', i).decode('utf-32')


def unescape(text):
    """Resolve &#xxx; HTML entities (and some others)."""
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    out = text.replace(u'&nbsp;', u' ')

    def replchar(m):
        num = m.group(1)
        return unichar(int(num))
    out = re.sub(r"&#(\d+);", replchar, out)
    return out
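
# Illustrative behavior of unescape(), shown as comments (hypothetical
# inputs, not from the source):
#     unescape(u'M&#252;nchen')  ->  u'München'  (numeric entity resolved)
#     unescape(u'a&nbsp;b')      ->  u'a b'      (non-breaking space replaced)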


def extract_text_between(html, start_marker, end_marker):
    try:
        _, html = html.split(start_marker, 1)
        html, _ = html.split(end_marker, 1)
    except ValueError:
        return u''
    return html


def extract_text_in(html, starttag):
    """Extract the text from a <DIV> tag in the HTML starting with
    ``starttag``. Returns None if parsing fails.
    """
    # Strip off the leading text before opening tag.
    try:
        _, html = html.split(starttag, 1)
    except ValueError:
        return

    # Walk through balanced DIV tags.
    level = 0
    parts = []
    pos = 0
    for match in DIV_RE.finditer(html):
        if match.group(1):  # Closing tag.
            level -= 1
            if level == 0:
                pos = match.end()
        else:  # Opening tag.
            if level == 0:
                parts.append(html[pos:match.start()])
            level += 1

        if level == -1:
            parts.append(html[pos:match.start()])
            break
    else:
        print(u'no closing tag found!')
        return
    return u''.join(parts)
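
# Sketch of the traversal above (hypothetical input, result as a comment):
#     extract_text_in(u"<div>a<div>b</div>c</div>", u"<div>")  ->  u'ac'
# Only text at the top nesting level of the matched <div> is kept; the
# nested <div>b</div> is skipped while the tag balance is walked.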


def search_pairs(item):
    """Yield pairs of artists and titles to search for.

    The first item in the pair is the name of the artist, the second
    item is a list of song names.

    In addition to the artist and title obtained from the `item` the
    method tries to strip extra information like parenthesized suffixes
    and featured artists from the strings and add them as candidates.
    The method also tries to split multiple titles separated with `/`.
    """
    def generate_alternatives(string, patterns):
        """Generate string alternatives by extracting first matching group for
        each given pattern.
        """
        alternatives = [string]
        for pattern in patterns:
            match = re.search(pattern, string, re.IGNORECASE)
            if match:
                alternatives.append(match.group(1))
        return alternatives

    title, artist = item.title, item.artist

    patterns = [
        # Remove any featuring artists from the artists name
        r"(.*?) {0}".format(plugins.feat_tokens())]
    artists = generate_alternatives(artist, patterns)

    patterns = [
        # Remove a parenthesized suffix from a title string. Common
        # examples include (live), (remix), and (acoustic).
        r"(.+?)\s+[(].*[)]$",
        # Remove any featuring artists from the title
        r"(.*?) {0}".format(plugins.feat_tokens(for_artist=False)),
        # Remove part of title after colon ':' for songs with subtitles
        r"(.+?)\s*:.*"]
    titles = generate_alternatives(title, patterns)

    # Check for a dual song (e.g. Pink Floyd - Speak to Me / Breathe)
    # and, if found, also add its individual titles as candidates.
    multi_titles = []
    for title in titles:
        multi_titles.append([title])
        if '/' in title:
            multi_titles.append([x.strip() for x in title.split('/')])

    return itertools.product(artists, multi_titles)
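
# For example (hypothetical item, output shown as comments): with
# item.artist == u'Pink Floyd' and item.title == u'Speak to Me / Breathe',
# and assuming no featuring/suffix patterns match, the generator yields:
#     (u'Pink Floyd', [u'Speak to Me / Breathe'])
#     (u'Pink Floyd', [u'Speak to Me', u'Breathe'])
# so a dual-titled track is also searched as its individual songs.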


def slug(text):
    """Make a URL-safe, human-readable version of the given text

    This will do the following:

    1. decode unicode characters into ASCII
    2. shift everything to lowercase
    3. strip whitespace
    4. replace other non-word characters with dashes
    5. strip extra dashes

    This somewhat duplicates the :func:`Google.slugify` function but
    slugify is not as generic as this one, which can be reused
    elsewhere.
    """
    return re.sub(r'\W+', '-', unidecode(text).lower().strip()).strip('-')
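
# Illustrative results (hypothetical inputs, shown as comments):
#     slug(u'Con te partirò!')            ->  u'con-te-partiro'
#     slug(u'  Speak to Me / Breathe  ')  ->  u'speak-to-me-breathe'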


class Backend(object):
    def __init__(self, config, log):
        self._log = log

    @staticmethod
    def _encode(s):
        """Encode the string for inclusion in a URL"""
        if isinstance(s, six.text_type):
            for char, repl in URL_CHARACTERS.items():
                s = s.replace(char, repl)
            s = s.encode('utf-8', 'ignore')
        return urllib.parse.quote(s)

    def build_url(self, artist, title):
        return self.URL_PATTERN % (self._encode(artist.title()),
                                   self._encode(title.title()))

    def fetch_url(self, url):
        """Retrieve the content at a given URL, or return None if the source
        is unreachable.
        """
        try:
            # Disable the InsecureRequestWarning that comes from using
            # `verify=False`.
            # https://github.com/kennethreitz/requests/issues/2214
            # We're not overly worried about the NSA MITMing our lyrics
            # scraper.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                r = requests.get(url, verify=False, headers={
                    'User-Agent': USER_AGENT,
                })
        except requests.RequestException as exc:
            self._log.debug(u'lyrics request failed: {0}', exc)
            return
        if r.status_code == requests.codes.ok:
            return r.text
        else:
            self._log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)

    def fetch(self, artist, title):
        raise NotImplementedError()
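
# Sketch of the URL-encoding helper above (hypothetical value):
#     Backend._encode(u'Don\u2019t Stop')  ->  'Don%27t%20Stop'
# The smart quote is first mapped to an ASCII apostrophe via
# URL_CHARACTERS, then the result is percent-encoded.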


class SymbolsReplaced(Backend):
    REPLACEMENTS = {
        r'\s+': '_',
        '<': 'Less_Than',
        '>': 'Greater_Than',
        '#': 'Number_',
        r'[\[\{]': '(',
        r'[\]\}]': ')',
    }

    @classmethod
    def _encode(cls, s):
        for old, new in cls.REPLACEMENTS.items():
            s = re.sub(old, new, s)

        return super(SymbolsReplaced, cls)._encode(s)


class MusiXmatch(SymbolsReplaced):
    REPLACEMENTS = dict(SymbolsReplaced.REPLACEMENTS, **{
        r'\s+': '-'
    })

    URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'

    def fetch(self, artist, title):
        url = self.build_url(artist, title)

        html = self.fetch_url(url)
        if not html:
            return
        if "We detected that your IP is blocked" in html:
            self._log.warning(u'we are blocked at MusixMatch: url %s failed'
                              % url)
            return
        html_part = html.split('<p class="mxm-lyrics__content')[-1]
        lyrics = extract_text_between(html_part, '>', '</p>')
        lyrics = lyrics.strip(',"').replace('\\n', '\n')
        # another odd case: sometimes only that string remains, for
        # missing songs. this seems to happen after being blocked
        # above, when filling in the CAPTCHA.
        if "Instant lyrics for all your music." in lyrics:
            return
        return lyrics
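
# URL construction sketch for this backend (hypothetical values):
#     MusiXmatch(config, log).build_url(u'pink floyd', u'breathe')
#     ->  'https://www.musixmatch.com/lyrics/Pink-Floyd/Breathe'
# build_url() title-cases both strings, and _encode() turns runs of
# whitespace into dashes per REPLACEMENTS before percent-encoding.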


class Genius(Backend):
    """Fetch lyrics from Genius via genius-api.

    Simply adapted from
    bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
    """

    base_url = "https://api.genius.com"

    def __init__(self, config, log):
        super(Genius, self).__init__(config, log)
        self.api_key = config['genius_api_key'].as_str()
        self.headers = {
            'Authorization': "Bearer %s" % self.api_key,
            'User-Agent': USER_AGENT,
        }

    def lyrics_from_song_api_path(self, song_api_path):
        song_url = self.base_url + song_api_path
        response = requests.get(song_url, headers=self.headers)
        json = response.json()
        path = json["response"]["song"]["path"]

        # Gotta go regular html scraping... come on Genius.
        page_url = "https://genius.com" + path
        try:
            page = requests.get(page_url)
        except requests.RequestException as exc:
            self._log.debug(u'Genius page request for {0} failed: {1}',
                            page_url, exc)
            return None
        html = BeautifulSoup(page.text, "html.parser")

        # Remove script tags that they put in the middle of the lyrics.
        [h.extract() for h in html('script')]

        # At least Genius is nice and has a tag called 'lyrics'!
        # Updated css where the lyrics are based in HTML.
        lyrics = html.find("div", class_="lyrics").get_text()

        return lyrics

    def fetch(self, artist, title):
        search_url = self.base_url + "/search"
        data = {'q': title}
        try:
            response = requests.get(search_url, data=data,
                                    headers=self.headers)
        except requests.RequestException as exc:
            self._log.debug(u'Genius API request failed: {0}', exc)
            return None

        try:
            json = response.json()
        except ValueError:
            self._log.debug(u'Genius API request returned invalid JSON')
            return None

        song_info = None
        for hit in json["response"]["hits"]:
            if hit["result"]["primary_artist"]["name"] == artist:
                song_info = hit
                break

        if song_info:
            song_api_path = song_info["result"]["api_path"]
            return self.lyrics_from_song_api_path(song_api_path)


class LyricsWiki(SymbolsReplaced):
    """Fetch lyrics from LyricsWiki."""

    URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'

    def fetch(self, artist, title):
        url = self.build_url(artist, title)
        html = self.fetch_url(url)
        if not html:
            return

        # Get the HTML fragment inside the appropriate HTML element and then
        # extract the text from it.
        html_frag = extract_text_in(html, u"<div class='lyricbox'>")
        if html_frag:
            lyrics = _scrape_strip_cruft(html_frag, True)

            if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
                return lyrics


def remove_credits(text):
    """Remove the first/last line of text if it contains the word 'lyrics',
    e.g. 'Lyrics by songsdatabase.com'.
    """
    textlines = text.split('\n')
    credits = None
    for i in (0, -1):
        if textlines and 'lyrics' in textlines[i].lower():
            credits = textlines.pop(i)
    if credits:
        text = '\n'.join(textlines)
    return text
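
# A minimal sketch of the effect (hypothetical input):
#     remove_credits(u'Lyrics by songsdatabase.com\nLa la la\nLa la')
#     ->  u'La la la\nLa la'
# Only the first and last lines are inspected, so a credit line in the
# middle of the text is left alone.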


def _scrape_strip_cruft(html, plain_text_out=False):
    """Clean up HTML
    """
    html = unescape(html)

    html = html.replace('\r', '\n')  # Normalize EOL.
    html = re.sub(r' +', ' ', html)  # Whitespaces collapse.
    html = BREAK_RE.sub('\n', html)  # <br> eats up surrounding '\n'.
    html = re.sub(r'(?s)<(script).*?</\1>', '', html)  # Strip script tags.

    if plain_text_out:  # Strip remaining HTML tags
        html = COMMENT_RE.sub('', html)
        html = TAG_RE.sub('', html)

    html = '\n'.join([x.strip() for x in html.strip().split('\n')])
    html = re.sub(r'\n{3,}', r'\n\n', html)
    return html
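
# Behavior sketch (hypothetical input, result as a comment):
#     _scrape_strip_cruft(u'Hello<br/>world<!-- ad -->', plain_text_out=True)
#     ->  u'Hello\nworld'
# The <br/> becomes a newline, and the HTML comment and any remaining
# tags are dropped in plain-text mode.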


def _scrape_merge_paragraphs(html):
    html = re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
    return re.sub(r'<div .*>\s*</div>', '\n', html)


def scrape_lyrics_from_html(html):
    """Scrape lyrics from a URL. If no lyrics can be found, return None
    instead.
    """
    if not HAS_BEAUTIFUL_SOUP:
        return None

    if not html:
        return None

    def is_text_notcode(text):
        length = len(text)
        return (length > 20 and
                text.count(' ') > length / 25 and
                (text.find('{') == -1 or text.find(';') == -1))
    html = _scrape_strip_cruft(html)
    html = _scrape_merge_paragraphs(html)

    # extract all long text blocks that are not code
    try:
        soup = BeautifulSoup(html, "html.parser",
                             parse_only=SoupStrainer(text=is_text_notcode))
    except HTMLParseError:
        return None

    # Get the longest text element (if any).
    strings = sorted(soup.stripped_strings, key=len, reverse=True)
    if strings:
        return strings[0]
    else:
        return None


class Google(Backend):
    """Fetch lyrics from Google search results."""

    def __init__(self, config, log):
        super(Google, self).__init__(config, log)
        self.api_key = config['google_API_key'].as_str()
        self.engine_id = config['google_engine_ID'].as_str()

    def is_lyrics(self, text, artist=None):
        """Determine whether the text seems to be valid lyrics.
        """
        if not text:
            return False
        bad_triggers_occ = []
        nb_lines = text.count('\n')
        if nb_lines <= 1:
            self._log.debug(u"Ignoring too short lyrics '{0}'", text)
            return False
        elif nb_lines < 5:
            bad_triggers_occ.append('too_short')
        else:
            # Lyrics look legit, remove credits to avoid being penalized
            # further down
            text = remove_credits(text)

        bad_triggers = ['lyrics', 'copyright', 'property', 'links']
        if artist:
            bad_triggers_occ += [artist]

        for item in bad_triggers:
            bad_triggers_occ += [item] * len(re.findall(r'\W%s\W' % item,
                                                        text, re.I))

        if bad_triggers_occ:
            self._log.debug(u'Bad triggers detected: {0}', bad_triggers_occ)
        return len(bad_triggers_occ) < 2

    def slugify(self, text):
        """Normalize a string and remove non-alphanumeric characters.
        """
        text = re.sub(r"[-'_\s]", '_', text)
        text = re.sub(r"_+", '_', text).strip('_')
        pat = r"([^,\(]*)\((.*?)\)"  # Remove content within parentheses
        text = re.sub(pat, r'\g<1>', text).strip()
        try:
            text = unicodedata.normalize('NFKD', text).encode('ascii',
                                                              'ignore')
            text = six.text_type(re.sub(r'[-\s]+', ' ', text.decode('utf-8')))
        except UnicodeDecodeError:
            self._log.exception(u"Failing to normalize '{0}'", text)
        return text

    BY_TRANS = ['by', 'par', 'de', 'von']
    LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte']

    def is_page_candidate(self, url_link, url_title, title, artist):
        """Return True if the URL title makes it a good candidate to be a
        page that contains lyrics of title by artist.
        """
        title = self.slugify(title.lower())
        artist = self.slugify(artist.lower())
        sitename = re.search(u"//([^/]+)/.*",
                             self.slugify(url_link.lower())).group(1)
        url_title = self.slugify(url_title.lower())

        # Check if URL title contains song title (exact match)
        if url_title.find(title) != -1:
            return True

        # or try extracting song title from URL title and check if
        # they are close enough
        tokens = [by + '_' + artist for by in self.BY_TRANS] + \
                 [artist, sitename, sitename.replace('www.', '')] + \
                 self.LYRICS_TRANS
        tokens = [re.escape(t) for t in tokens]
        song_title = re.sub(u'(%s)' % u'|'.join(tokens), u'', url_title)

        song_title = song_title.strip('_|')
        typo_ratio = .9
        ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
        return ratio >= typo_ratio
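
    # Matching sketch (hypothetical values, shown as a comment): for
    # title u'black hole sun', artist u'soundgarden', and a search result
    # titled u'Black Hole Sun lyrics by Soundgarden', the slugified URL
    # title contains the slugified song title, so the page is accepted
    # without needing the SequenceMatcher fallback.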

    def fetch(self, artist, title):
        query = u"%s %s" % (artist, title)
        url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' \
              % (self.api_key, self.engine_id,
                 urllib.parse.quote(query.encode('utf-8')))

        data = self.fetch_url(url)
        if not data:
            self._log.debug(u'google backend returned no data')
            return None
        try:
            data = json.loads(data)
        except ValueError as exc:
            self._log.debug(u'google backend returned malformed JSON: {}', exc)
            return None  # Don't try to use the unparsed body below.
        if 'error' in data:
            reason = data['error']['errors'][0]['reason']
            self._log.debug(u'google backend error: {0}', reason)
            return None

        if 'items' in data.keys():
            for item in data['items']:
                url_link = item['link']
                url_title = item.get('title', u'')
                if not self.is_page_candidate(url_link, url_title,
                                              title, artist):
                    continue
                html = self.fetch_url(url_link)
                lyrics = scrape_lyrics_from_html(html)
                if not lyrics:
                    continue

                if self.is_lyrics(lyrics, artist):
                    self._log.debug(u'got lyrics from {0}',
                                    item['displayLink'])
                    return lyrics


class LyricsPlugin(plugins.BeetsPlugin):
    SOURCES = ['google', 'lyricwiki', 'musixmatch', 'genius']
    SOURCE_BACKENDS = {
        'google': Google,
        'lyricwiki': LyricsWiki,
        'musixmatch': MusiXmatch,
        'genius': Genius,
    }

    def __init__(self):
        super(LyricsPlugin, self).__init__()
        self.import_stages = [self.imported]
        self.config.add({
            'auto': True,
            'bing_client_secret': None,
            'bing_lang_from': [],
            'bing_lang_to': None,
            'google_API_key': None,
            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
            'genius_api_key':
                "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
                "76V-uFL5jks5dNvcGCdarqFjDhP9c",
            'fallback': None,
            'force': False,
            'local': False,
            'sources': self.SOURCES,
        })
        self.config['bing_client_secret'].redact = True
        self.config['google_API_key'].redact = True
        self.config['google_engine_ID'].redact = True
        self.config['genius_api_key'].redact = True

        # State information for the ReST writer.
        # First, the current artist we're writing.
        self.artist = u'Unknown artist'
        # The current album: False means no album yet.
        self.album = False
        # The current rest file content. None means the file is not
        # open yet.
        self.rest = None

        available_sources = list(self.SOURCES)
        sources = plugins.sanitize_choices(
            self.config['sources'].as_str_seq(), available_sources)

        if 'google' in sources:
            if not self.config['google_API_key'].get():
                # We log a *debug* message here because the default
                # configuration includes `google`. This way, the source
                # is silent by default but can be enabled just by
                # setting an API key.
                self._log.debug(u'Disabling google source: '
                                u'no API key configured.')
                sources.remove('google')
            elif not HAS_BEAUTIFUL_SOUP:
                self._log.warning(u'To use the google lyrics source, you must '
                                  u'install the beautifulsoup4 module. See '
                                  u'the documentation for further details.')
                sources.remove('google')

        if 'genius' in sources and not HAS_BEAUTIFUL_SOUP:
            self._log.debug(
                u'The Genius backend requires BeautifulSoup, which is not '
                u'installed, so the source is disabled.'
            )
            sources.remove('genius')

        self.config['bing_lang_from'] = [
            x.lower() for x in self.config['bing_lang_from'].as_str_seq()]
        self.bing_auth_token = None

        if not HAS_LANGDETECT and self.config['bing_client_secret'].get():
            self._log.warning(u'To use bing translations, you need to '
                              u'install the langdetect module. See the '
                              u'documentation for further details.')

        self.backends = [self.SOURCE_BACKENDS[source](self.config, self._log)
                         for source in sources]

    def get_bing_access_token(self):
        params = {
            'client_id': 'beets',
            'client_secret': self.config['bing_client_secret'],
            'scope': "https://api.microsofttranslator.com",
            'grant_type': 'client_credentials',
        }

        oauth_url = 'https://datamarket.accesscontrol.windows.net/v2/OAuth2-13'
        oauth_token = json.loads(requests.post(
            oauth_url,
            data=urllib.parse.urlencode(params)).content)
        if 'access_token' in oauth_token:
            return "Bearer " + oauth_token['access_token']
        else:
            self._log.warning(u'Could not get Bing Translate API access token.'
                              u' Check your "bing_client_secret" password')

    def commands(self):
        cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
        cmd.parser.add_option(
            u'-p', u'--print', dest='printlyr',
            action='store_true', default=False,
            help=u'print lyrics to console',
        )
        cmd.parser.add_option(
            u'-r', u'--write-rest', dest='writerest',
            action='store', default=None, metavar='dir',
            help=u'write lyrics to given directory as ReST files',
        )
        cmd.parser.add_option(
            u'-f', u'--force', dest='force_refetch',
            action='store_true', default=False,
            help=u'always re-download lyrics',
        )
        cmd.parser.add_option(
            u'-l', u'--local', dest='local_only',
            action='store_true', default=False,
            help=u'do not fetch missing lyrics',
        )

        def func(lib, opts, args):
            # The "write to files" option corresponds to the
            # import_write config value.
            write = ui.should_write()
            if opts.writerest:
                self.writerest_indexes(opts.writerest)
            for item in lib.items(ui.decargs(args)):
                if not opts.local_only and not self.config['local']:
                    self.fetch_item_lyrics(
                        lib, item, write,
                        opts.force_refetch or self.config['force'],
                    )
                if item.lyrics:
                    if opts.printlyr:
                        ui.print_(item.lyrics)
                    if opts.writerest:
                        self.writerest(opts.writerest, item)
            if opts.writerest:
                # flush last artist
                self.writerest(opts.writerest, None)
                ui.print_(u'ReST files generated. To build, use one of:')
                ui.print_(u'  sphinx-build -b html %s _build/html'
                          % opts.writerest)
                ui.print_(u'  sphinx-build -b epub %s _build/epub'
                          % opts.writerest)
                ui.print_((u'  sphinx-build -b latex %s _build/latex '
                           u'&& make -C _build/latex all-pdf')
                          % opts.writerest)
        cmd.func = func
        return [cmd]

    def writerest(self, directory, item):
        """Write the item to a ReST file.

        This will keep state (in the `rest` variable) in order to avoid
        writing continuously to the same files.
        """

        if item is None or slug(self.artist) != slug(item.albumartist):
            if self.rest is not None:
                path = os.path.join(directory, 'artists',
                                    slug(self.artist) + u'.rst')
                with open(path, 'wb') as output:
                    output.write(self.rest.encode('utf-8'))
                self.rest = None
                if item is None:
                    return
            self.artist = item.albumartist.strip()
            self.rest = u"%s\n%s\n\n.. contents::\n :local:\n\n" \
                        % (self.artist,
                           u'=' * len(self.artist))
        if self.album != item.album:
            tmpalbum = self.album = item.album.strip()
            if self.album == '':
                tmpalbum = u'Unknown album'
            self.rest += u"%s\n%s\n\n" % (tmpalbum, u'-' * len(tmpalbum))
        title_str = u":index:`%s`" % item.title.strip()
        block = u'| ' + item.lyrics.replace(u'\n', u'\n| ')
        self.rest += u"%s\n%s\n\n%s\n\n" % (title_str,
                                            u'~' * len(title_str),
                                            block)
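
    # Shape of the generated ReST, shown as a comment (hypothetical item
    # with albumartist u'Artist', album u'Album', title u'Song'):
    #     Artist
    #     ======
    #
    #     .. contents::
    #      :local:
    #
    #     Album
    #     -----
    #
    #     :index:`Song`
    #     ~~~~~~~~~~~~~
    #
    #     | first lyrics line
    #     | second lyrics line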

    def writerest_indexes(self, directory):
        """Write conf.py and index.rst files necessary for Sphinx

        We write minimal configurations that are necessary for Sphinx
        to operate. We do not overwrite existing files so that
        customizations are respected."""
        try:
            os.makedirs(os.path.join(directory, 'artists'))
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else:
                raise
        indexfile = os.path.join(directory, 'index.rst')
        if not os.path.exists(indexfile):
            with open(indexfile, 'w') as output:
                output.write(REST_INDEX_TEMPLATE)
        conffile = os.path.join(directory, 'conf.py')
        if not os.path.exists(conffile):
            with open(conffile, 'w') as output:
                output.write(REST_CONF_TEMPLATE)

    def imported(self, session, task):
        """Import hook for fetching lyrics automatically.
        """
        if self.config['auto']:
            for item in task.imported_items():
                self.fetch_item_lyrics(session.lib, item,
                                       False, self.config['force'])

    def fetch_item_lyrics(self, lib, item, write, force):
        """Fetch and store lyrics for a single item. If ``write``, then the
        lyrics will also be written to the file itself.
        """
        # Skip if the item already has lyrics.
        if not force and item.lyrics:
            self._log.info(u'lyrics already present: {0}', item)
            return

        lyrics = None
        for artist, titles in search_pairs(item):
            lyrics = [self.get_lyrics(artist, title) for title in titles]
            if any(lyrics):
                break

        lyrics = u"\n\n---\n\n".join([l for l in lyrics if l])

        if lyrics:
            self._log.info(u'fetched lyrics: {0}', item)
            if HAS_LANGDETECT and self.config['bing_client_secret'].get():
                lang_from = langdetect.detect(lyrics)
                if self.config['bing_lang_to'].get() != lang_from and (
                        not self.config['bing_lang_from'] or (
                            lang_from in self.config[
                                'bing_lang_from'].as_str_seq())):
                    lyrics = self.append_translation(
                        lyrics, self.config['bing_lang_to'])
        else:
            self._log.info(u'lyrics not found: {0}', item)
            fallback = self.config['fallback'].get()
            if fallback:
                lyrics = fallback
            else:
                return
        item.lyrics = lyrics
        if write:
            item.try_write()
        item.store()

    def get_lyrics(self, artist, title):
        """Fetch lyrics, trying each source in turn. Return a string or
        None if no lyrics were found.
        """
        for backend in self.backends:
            lyrics = backend.fetch(artist, title)
            if lyrics:
                self._log.debug(u'got lyrics from backend: {0}',
                                backend.__class__.__name__)
                return _scrape_strip_cruft(lyrics, True)

    def append_translation(self, text, to_lang):
        import xml.etree.ElementTree as ET

        if not self.bing_auth_token:
            self.bing_auth_token = self.get_bing_access_token()
        if self.bing_auth_token:
            # Extract unique lines to limit API request size per song
            text_lines = set(text.split('\n'))
            url = ('https://api.microsofttranslator.com/v2/Http.svc/'
                   'Translate?text=%s&to=%s' % ('|'.join(text_lines), to_lang))
            r = requests.get(url,
                             headers={"Authorization": self.bing_auth_token})
            if r.status_code != 200:
                self._log.debug('translation API error {}: {}', r.status_code,
                                r.text)
                if 'token has expired' in r.text:
                    self.bing_auth_token = None
                    return self.append_translation(text, to_lang)
                return text
            lines_translated = ET.fromstring(r.text.encode('utf-8')).text
            # Use a translation mapping dict to build resulting lyrics
            translations = dict(zip(text_lines, lines_translated.split('|')))
            result = ''
            for line in text.split('\n'):
                result += '%s / %s\n' % (line, translations[line])
            return result
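

# An illustrative beets configuration consuming this plugin; the keys
# mirror the defaults registered in LyricsPlugin.__init__ above, the
# values are placeholders (not real credentials), and the section name
# assumes the plugin is loaded under the usual 'lyrics' name:
#
#     lyrics:
#         auto: yes
#         sources: [google, lyricwiki, musixmatch, genius]
#         google_API_key: YOUR_GOOGLE_API_KEY
#         google_engine_ID: YOUR_CUSTOM_SEARCH_ENGINE_ID
#         bing_client_secret: YOUR_BING_SECRET
#         bing_lang_to: en
#         fallback: ''
#         force: no
#         local: no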