configs/arch-config/.config/calibre/metadata-sources-cache.json

18 lines
132 KiB
JSON
Raw Normal View History

2022-08-01 15:04:05 +02:00
{
"amazon": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport re\nimport string\nimport socket\nimport time\nfrom functools import partial\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\nfrom threading import Thread\ntry:\n from urllib.parse import urlparse\nexcept ImportError:\n from urlparse import urlparse\n\nfrom mechanize import HTTPError\n\nfrom calibre import as_unicode, browser, random_user_agent, xml_replace_entities\nfrom calibre.ebooks.metadata import check_isbn\nfrom calibre.ebooks.metadata.book.base import Metadata\nfrom calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase\nfrom calibre.utils.localization import canonicalize_lang\nfrom calibre.utils.random_ua import accept_header_for_ua\nfrom calibre.ebooks.oeb.base import urlquote\n\n\ndef sort_matches_preferring_kindle_editions(matches):\n upos_map = {url:i for i, url in enumerate(matches)}\n\n def skey(url):\n opos = upos_map[url]\n parts = url.split('/')\n try:\n idx = parts.index('dp')\n except Exception:\n idx = -1\n if idx < 0 or idx + 1 >= len(parts) or not parts[idx+1].startswith('B'):\n return 1, opos\n return 0, opos\n matches.sort(key=skey)\n return matches\n\n\ndef iri_quote_plus(url):\n ans = urlquote(url)\n if isinstance(ans, bytes):\n ans = ans.decode('utf-8')\n return ans.replace('%20', '+')\n\n\ndef user_agent_is_ok(ua):\n return 'Mobile/' not in ua and 'Mobile ' not in ua\n\n\nclass CaptchaError(Exception):\n pass\n\n\nclass SearchFailed(ValueError):\n pass\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef parse_details_page(url, log, timeout, browser, domain):\n from calibre.utils.cleantext import clean_ascii_chars\n from calibre.ebooks.chardet import xml_to_unicode\n from lxml.html import tostring\n try:\n from calibre.ebooks.metadata.sources.update import search_engines_module\n get_data_for_cached_url = search_engines_module().get_data_for_cached_url\n except Exception:\n get_data_for_cached_url = lambda *a: None\n raw = get_data_for_cached_url(url)\n if raw:\n log('Using cached details for url:', url)\n else:\n log('Downloading details from:', url)\n try:\n raw = browser.open_novisit(url, timeout=timeout).read().strip()\n except Exception as e:\n if callable(getattr(e, 'getcode', None)) and \\\n e.getcode() == 404:\n log.error('URL malformed: %r' % url)\n return\n attr = getattr(e, 'args', [None])\n attr = attr if attr else [None]\n if isinstance(attr[0], socket.timeout):\n msg = 'Details page timed out. Try again later.'\n log.error(msg)\n else:\n msg = 'Failed to make details query: %r' % url\n log.exception(msg)\n return\n\n oraw = raw\n if 'amazon.com.br' in url:\n # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag\n raw = raw.decode('utf-8')\n raw = xml_to_unicode(raw, strip_encoding_pats=True,\n resolve_entities=True)[0]\n if '<title>404 - ' in raw:\n raise ValueError('URL malformed: %r' % url)\n if '>Could not find the requested document in the cache.<' in raw:\n raise ValueError('No cached entry for %s found' % url)\n\n try:\n root = parse_html(clean_ascii_chars(raw))\n except Exception:\n msg = 'Failed to parse amazon details page: %r' % url\n
"big_book_search": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom calibre.ebooks.metadata.sources.base import Source, Option\n\n\ndef get_urls(br, tokens):\n from urllib.parse import quote_plus\n from html5_parser import parse\n escaped = (quote_plus(x) for x in tokens if x and x.strip())\n q = '+'.join(escaped)\n url = 'https://bigbooksearch.com/please-dont-scrape-my-site-you-will-put-my-api-key-over-the-usage-limit-and-the-site-will-break/books/'+q\n raw = br.open(url).read()\n root = parse(raw.decode('utf-8'))\n urls = [i.get('src') for i in root.xpath('//img[@src]')]\n return urls\n\n\nclass BigBookSearch(Source):\n\n name = 'Big Book Search'\n version = (1, 0, 1)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')\n capabilities = frozenset(['cover'])\n can_get_multiple_covers = True\n options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),\n _('The maximum number of covers to process from the search result')),\n )\n supports_gzip_transfer_encoding = True\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if not title:\n return\n br = self.browser\n tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))\n urls = get_urls(br, tokens)\n self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)\n\n\ndef test():\n from calibre import browser\n import pprint\n br = browser()\n urls = get_urls(br, ['consider', 'phlebas', 'banks'])\n pprint.pprint(urls)\n\n\nif __name__ == '__main__':\n test()\n",
"edelweiss": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nimport time, re\nfrom threading import Thread\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\n\nfrom calibre import as_unicode, random_user_agent\nfrom calibre.ebooks.metadata import check_isbn\nfrom calibre.ebooks.metadata.sources.base import Source\n\n\ndef clean_html(raw):\n from calibre.ebooks.chardet import xml_to_unicode\n from calibre.utils.cleantext import clean_ascii_chars\n return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,\n resolve_entities=True, assume_utf8=True)[0])\n\n\ndef parse_html(raw):\n raw = clean_html(raw)\n from html5_parser import parse\n return parse(raw)\n\n\ndef astext(node):\n from lxml import etree\n return etree.tostring(node, method='text', encoding='unicode',\n with_tail=False).strip()\n\n\nclass Worker(Thread): # {{{\n\n def __init__(self, basic_data, relevance, result_queue, br, timeout, log, plugin):\n Thread.__init__(self)\n self.daemon = True\n self.basic_data = basic_data\n self.br, self.log, self.timeout = br, log, timeout\n self.result_queue, self.plugin, self.sku = result_queue, plugin, self.basic_data['sku']\n self.relevance = relevance\n\n def run(self):\n url = ('https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/product/two_Enhanced.ascx&'\n 'sku={0}&idPrefix=content_1_{0}&mode=0'.format(self.sku))\n try:\n raw = self.br.open_novisit(url, timeout=self.timeout).read()\n except:\n self.log.exception('Failed to load comments page: %r'%url)\n return\n\n try:\n mi = self.parse(raw)\n mi.source_relevance = self.relevance\n self.plugin.clean_downloaded_metadata(mi)\n self.result_queue.put(mi)\n except:\n self.log.exception('Failed to parse details for sku: %s'%self.sku)\n\n def parse(self, raw):\n from calibre.ebooks.metadata.book.base import Metadata\n from calibre.utils.date import UNDEFINED_DATE\n root = parse_html(raw)\n mi = Metadata(self.basic_data['title'], self.basic_data['authors'])\n\n # Identifiers\n if self.basic_data['isbns']:\n mi.isbn = self.basic_data['isbns'][0]\n mi.set_identifier('edelweiss', self.sku)\n\n # Tags\n if self.basic_data['tags']:\n mi.tags = self.basic_data['tags']\n mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]\n\n # Publisher\n mi.publisher = self.basic_data['publisher']\n\n # Pubdate\n if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:\n mi.pubdate = self.basic_data['pubdate']\n\n # Rating\n if self.basic_data['rating']:\n mi.rating = self.basic_data['rating']\n\n # Comments\n comments = ''\n for cid in ('summary', 'contributorbio', 'quotes_reviews'):\n cid = 'desc_{}{}-content'.format(cid, self.sku)\n div = root.xpath('//*[@id=\"{}\"]'.format(cid))\n if div:\n comments += self.render_comments(div[0])\n if comments:\n mi.comments = comments\n\n mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None\n return mi\n\n def render_comments(self, desc):\n from lxml import etree\n from calibre.library.comments import sanitize_comments_html\n for c in desc.xpath('descendant::noscript'):\n c.getparent().remove(c)\n for a in desc.xpath('descendant::a[@href]'):\n del a.attrib['href']\n a.tag = 'span'\n
"google": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>\nfrom __future__ import absolute_import, division, print_function, unicode_literals\nimport hashlib\nimport os\nimport re\nimport regex\nimport sys\nimport tempfile\nimport time\n\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\n\nfrom calibre import as_unicode, replace_entities, prepare_string_for_xml\nfrom calibre.ebooks.chardet import xml_to_unicode\nfrom calibre.ebooks.metadata import authors_to_string, check_isbn\nfrom calibre.ebooks.metadata.book.base import Metadata\nfrom calibre.ebooks.metadata.sources.base import Source\nfrom calibre.utils.cleantext import clean_ascii_chars\nfrom calibre.utils.localization import canonicalize_lang\n\nNAMESPACES = {\n 'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',\n 'atom': 'http://www.w3.org/2005/Atom',\n 'dc': 'http://purl.org/dc/terms',\n 'gd': 'http://schemas.google.com/g/2005'\n}\n\n\ndef pretty_google_books_comments(raw):\n raw = replace_entities(raw)\n # Paragraphs in the comments are removed but whatever software googl uses\n # to do this does not insert a space so we often find the pattern\n # word.Capital in the comments which can be used to find paragraph markers.\n parts = []\n for x in re.split(r'([a-z)\"”])(\\.)([A-Z(\"“])', raw):\n if x == '.':\n parts.append('.</p>\\n\\n<p>')\n else:\n parts.append(prepare_string_for_xml(x))\n raw = '<p>' + ''.join(parts) + '</p>'\n return raw\n\n\ndef get_details(browser, url, timeout): # {{{\n try:\n raw = browser.open_novisit(url, timeout=timeout).read()\n except Exception as e:\n gc = getattr(e, 'getcode', lambda: -1)\n if gc() != 403:\n raise\n # Google is throttling us, wait a little\n time.sleep(2)\n raw = browser.open_novisit(url, timeout=timeout).read()\n\n return raw\n\n\n# }}}\n\nxpath_cache = {}\n\n\ndef XPath(x):\n ans = xpath_cache.get(x)\n if ans is None:\n from lxml import etree\n ans = xpath_cache[x] = etree.XPath(x, namespaces=NAMESPACES)\n return ans\n\n\ndef to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{\n from lxml import etree\n\n # total_results = XPath('//openSearch:totalResults')\n # start_index = XPath('//openSearch:startIndex')\n # items_per_page = XPath('//openSearch:itemsPerPage')\n entry = XPath('//atom:entry')\n entry_id = XPath('descendant::atom:id')\n url = XPath('descendant::atom:link[@rel=\"self\"]/@href')\n creator = XPath('descendant::dc:creator')\n identifier = XPath('descendant::dc:identifier')\n title = XPath('descendant::dc:title')\n date = XPath('descendant::dc:date')\n publisher = XPath('descendant::dc:publisher')\n subject = XPath('descendant::dc:subject')\n description = XPath('descendant::dc:description')\n language = XPath('descendant::dc:language')\n\n # print(etree.tostring(entry_, pretty_print=True))\n\n def get_text(extra, x):\n try:\n ans = x(extra)\n if ans:\n ans = ans[0].text\n if ans and ans.strip():\n return ans.strip()\n except:\n log.exception('Programming error:')\n return None\n\n def get_extra_details():\n raw = get_details(browser, details_url, timeout)\n if running_a_test:\n with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:\n f.write(raw)\n print('Book details saved to:', f.name, file=sys.stderr)\n feed = etree.fromstring(\n xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n )\n return entry(feed)[0]\n\n if isinstance(entry_, str):\n google_id = e
"google_images": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom collections import OrderedDict\n\nfrom calibre import random_user_agent\nfrom calibre.ebooks.metadata.sources.base import Source, Option\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef imgurl_from_id(raw, tbnid):\n from json import JSONDecoder\n q = '\"{}\",['.format(tbnid)\n start_pos = raw.index(q)\n if start_pos < 100:\n return\n jd = JSONDecoder()\n data = jd.raw_decode('[' + raw[start_pos:])[0]\n # from pprint import pprint\n # pprint(data)\n url_num = 0\n for x in data:\n if isinstance(x, list) and len(x) == 3:\n q = x[0]\n if hasattr(q, 'lower') and q.lower().startswith('http'):\n url_num += 1\n if url_num > 1:\n return q\n\n\nclass GoogleImages(Source):\n\n name = 'Google Images'\n version = (1, 0, 3)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.')\n capabilities = frozenset(['cover'])\n can_get_multiple_covers = True\n supports_gzip_transfer_encoding = True\n options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),\n _('The maximum number of covers to process from the Google search result')),\n Option('size', 'choices', 'svga', _('Cover size'),\n _('Search for covers larger than the specified size'),\n choices=OrderedDict((\n ('any', _('Any size'),),\n ('l', _('Large'),),\n ('qsvga', _('Larger than %s')%'400x300',),\n ('vga', _('Larger than %s')%'640x480',),\n ('svga', _('Larger than %s')%'600x800',),\n ('xga', _('Larger than %s')%'1024x768',),\n ('2mp', _('Larger than %s')%'2 MP',),\n ('4mp', _('Larger than %s')%'4 MP',),\n ))),\n )\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if not title:\n return\n timeout = max(60, timeout) # Needs at least a minute\n title = ' '.join(self.get_title_tokens(title))\n author = ' '.join(self.get_author_tokens(authors))\n urls = self.get_image_urls(title, author, log, abort, timeout)\n self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)\n\n @property\n def user_agent(self):\n return random_user_agent(allow_ie=False)\n\n def get_image_urls(self, title, author, log, abort, timeout):\n from calibre.utils.cleantext import clean_ascii_chars\n try:\n from urllib.parse import urlencode\n except ImportError:\n from urllib import urlencode\n from collections import OrderedDict\n ans = OrderedDict()\n br = self.browser\n q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')})\n if isinstance(q, bytes):\n q = q.decode('utf-8')\n sz = self.prefs['size']\n if sz == 'any':\n sz = ''\n elif sz == 'l':\n sz = 'isz:l,'\n else:\n sz = 'isz:lt,islt:%s,' % sz\n # See https://www.google.com/advanced_image_search to understand this\n # URL scheme\n url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq
"hashes": {
"amazon": "e2dc988ace313e5689de47912c4c1446e2311868",
"big_book_search": "9b468ff223b7ddbf6bbd26e58194c0fa8fbbfb69",
"edelweiss": "efd8f281a56435895b7916f85598d469bb53bb02",
"google": "bfe34ba5e1be36511f2228a350bd91c676724aac",
"google_images": "3323061049afe11a42422f325783de220991149d",
"openlibrary": "8707d3b3161de476b46ed967dea1116707dcfb0a",
"search_engines": "52a24c5004033fc9f393a260a09e6f8afb80b0ba"
},
"openlibrary": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom calibre.ebooks.metadata.sources.base import Source\n\n\nclass OpenLibrary(Source):\n\n name = 'Open Library'\n version = (1, 0, 0)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads covers from The Open Library')\n\n capabilities = frozenset(['cover'])\n\n OPENLIBRARY = 'https://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if 'isbn' not in identifiers:\n return\n isbn = identifiers['isbn']\n br = self.browser\n try:\n ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()\n result_queue.put((self, ans))\n except Exception as e:\n if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:\n log.error('No cover for ISBN: %r found'%isbn)\n else:\n log.exception('Failed to download cover for ISBN:', isbn)\n",
"search_engines": "#!/usr/bin/env python\n# vim:fileencoding=utf-8\n# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\nimport json\nimport os\nimport re\nimport time\nfrom collections import namedtuple\nfrom contextlib import contextmanager\nfrom threading import Lock\n\ntry:\n from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote\nexcept ImportError:\n from urlparse import parse_qs\n from urllib import quote_plus, urlencode, unquote, quote\n\nfrom lxml import etree\n\nfrom calibre import browser as _browser, prints, random_user_agent\nfrom calibre.constants import cache_dir\nfrom calibre.ebooks.chardet import xml_to_unicode\nfrom calibre.utils.lock import ExclusiveFile\nfrom calibre.utils.random_ua import accept_header_for_ua\n\ncurrent_version = (1, 2, 0)\nminimum_calibre_version = (2, 80, 0)\nwebcache = {}\nwebcache_lock = Lock()\n\n\nResult = namedtuple('Result', 'url title cached_url')\n\n\n@contextmanager\ndef rate_limit(name='test', time_between_visits=2, max_wait_seconds=5 * 60, sleep_time=0.2):\n lock_file = os.path.join(cache_dir(), 'search-engine.' + name + '.lock')\n with ExclusiveFile(lock_file, timeout=max_wait_seconds, sleep_time=sleep_time) as f:\n try:\n lv = float(f.read().decode('utf-8').strip())\n except Exception:\n lv = 0\n # we cannot use monotonic() as this is cross process and historical\n # data as well\n delta = time.time() - lv\n if delta < time_between_visits:\n time.sleep(time_between_visits - delta)\n try:\n yield\n finally:\n f.seek(0)\n f.truncate()\n f.write(repr(time.time()).encode('utf-8'))\n\n\ndef tostring(elem):\n return etree.tostring(elem, encoding='unicode', method='text', with_tail=False)\n\n\ndef browser():\n ua = random_user_agent(allow_ie=False)\n # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'\n br = _browser(user_agent=ua)\n br.set_handle_gzip(True)\n br.addheaders += [\n ('Accept', accept_header_for_ua(ua)),\n ('Upgrade-insecure-requests', '1'),\n ]\n return br\n\n\ndef encode_query(**query):\n q = {k.encode('utf-8'): v.encode('utf-8') for k, v in query.items()}\n return urlencode(q).decode('utf-8')\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None, simple_scraper=None):\n with rate_limit(key):\n if simple_scraper is None:\n raw = br.open_novisit(url, timeout=timeout).read()\n raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]\n else:\n raw = simple_scraper(url, timeout=timeout)\n if dump_raw is not None:\n with open(dump_raw, 'w') as f:\n f.write(raw)\n if save_raw is not None:\n save_raw(raw)\n return parser(raw)\n\n\ndef quote_term(x):\n ans = quote_plus(x.encode('utf-8'))\n if isinstance(ans, bytes):\n ans = ans.decode('utf-8')\n return ans\n\n\n# DDG + Wayback machine {{{\n\n\ndef ddg_url_processor(url):\n return url\n\n\ndef ddg_term(t):\n t = t.replace('\"', '')\n if t.lower() in {'map', 'news'}:\n t = '\"' + t + '\"'\n if t in {'OR', 'AND', 'NOT'}:\n t = t.lower()\n return t\n\n\ndef ddg_href(url):\n if url.startswith('/'):\n q = url.partition('?')[2]\n url = parse_qs(q.encode('utf-8'))['uddg'][0].decode('utf-8')\n return url\n\n\ndef wayback_machine_cached_url(url, br=None, log=prints, timeout=60):\n q = quote_term(url)\n br = br or browser()\n data = query(br, 'https://archive.org/wayback/available?url=' +\n
}