configs/arch-config/.config/calibre/metadata-sources-cache.json

18 lines
132 KiB
JSON
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"amazon": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport re\nimport string\nimport socket\nimport time\nfrom functools import partial\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\nfrom threading import Thread\ntry:\n from urllib.parse import urlparse\nexcept ImportError:\n from urlparse import urlparse\n\nfrom mechanize import HTTPError\n\nfrom calibre import as_unicode, browser, random_user_agent, xml_replace_entities\nfrom calibre.ebooks.metadata import check_isbn\nfrom calibre.ebooks.metadata.book.base import Metadata\nfrom calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase\nfrom calibre.utils.localization import canonicalize_lang\nfrom calibre.utils.random_ua import accept_header_for_ua\nfrom calibre.ebooks.oeb.base import urlquote\n\n\ndef sort_matches_preferring_kindle_editions(matches):\n upos_map = {url:i for i, url in enumerate(matches)}\n\n def skey(url):\n opos = upos_map[url]\n parts = url.split('/')\n try:\n idx = parts.index('dp')\n except Exception:\n idx = -1\n if idx < 0 or idx + 1 >= len(parts) or not parts[idx+1].startswith('B'):\n return 1, opos\n return 0, opos\n matches.sort(key=skey)\n return matches\n\n\ndef iri_quote_plus(url):\n ans = urlquote(url)\n if isinstance(ans, bytes):\n ans = ans.decode('utf-8')\n return ans.replace('%20', '+')\n\n\ndef user_agent_is_ok(ua):\n return 'Mobile/' not in ua and 'Mobile ' not in ua\n\n\nclass CaptchaError(Exception):\n pass\n\n\nclass SearchFailed(ValueError):\n pass\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef parse_details_page(url, log, timeout, browser, domain):\n from calibre.utils.cleantext import clean_ascii_chars\n from calibre.ebooks.chardet import xml_to_unicode\n from lxml.html import tostring\n try:\n from calibre.ebooks.metadata.sources.update import search_engines_module\n get_data_for_cached_url = search_engines_module().get_data_for_cached_url\n except Exception:\n get_data_for_cached_url = lambda *a: None\n raw = get_data_for_cached_url(url)\n if raw:\n log('Using cached details for url:', url)\n else:\n log('Downloading details from:', url)\n try:\n raw = browser.open_novisit(url, timeout=timeout).read().strip()\n except Exception as e:\n if callable(getattr(e, 'getcode', None)) and \\\n e.getcode() == 404:\n log.error('URL malformed: %r' % url)\n return\n attr = getattr(e, 'args', [None])\n attr = attr if attr else [None]\n if isinstance(attr[0], socket.timeout):\n msg = 'Details page timed out. Try again later.'\n log.error(msg)\n else:\n msg = 'Failed to make details query: %r' % url\n log.exception(msg)\n return\n\n oraw = raw\n if 'amazon.com.br' in url:\n # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag\n raw = raw.decode('utf-8')\n raw = xml_to_unicode(raw, strip_encoding_pats=True,\n resolve_entities=True)[0]\n if '<title>404 - ' in raw:\n raise ValueError('URL malformed: %r' % url)\n if '>Could not find the requested document in the cache.<' in raw:\n raise ValueError('No cached entry for %s found' % url)\n\n try:\n root = parse_html(clean_ascii_chars(raw))\n except Exception:\n msg = 'Failed to parse amazon details page: %r' % url\n log.exception(msg)\n return\n if domain == 'jp':\n for a in root.xpath('//a[@href]'):\n if ('black-curtain-redirect.html' in a.get('href')) or ('/black-curtain/save-eligibility/black-curtain' in a.get('href')):\n url = a.get('href')\n if url:\n if url.startswith('/'):\n url = 'https://amazon.co.jp' + a.get('href')\n log('Black curtain redirect found, following')\n return parse_details_page(url, log, timeout, browser, domain)\n\n errmsg = root.xpath('//*[@id=\"errorMessage\"]')\n if errmsg:\n msg = 'Failed to parse amazon details page: %r' % url\n msg += tostring(errmsg, method='text', encoding='unicode').strip()\n log.error(msg)\n return\n\n from css_selectors import Select\n selector = Select(root)\n return oraw, root, selector\n\n\ndef parse_asin(root, log, url):\n try:\n link = root.xpath('//link[@rel=\"canonical\" and @href]')\n for l in link:\n return l.get('href').rpartition('/')[-1]\n except Exception:\n log.exception('Error parsing ASIN for url: %r' % url)\n\n\nclass Worker(Thread): # Get details {{{\n\n '''\n Get book details from amazons book page in a separate thread\n '''\n\n def __init__(self, url, result_queue, browser, log, relevance, domain,\n plugin, timeout=20, testing=False, preparsed_root=None,\n cover_url_processor=None, filter_result=None):\n Thread.__init__(self)\n self.cover_url_processor = cover_url_processor\n self.preparsed_root = preparsed_root\n self.daemon = True\n self.testing = testing\n self.url, self.result_queue = url, result_queue\n self.log, self.timeout = log, timeout\n self.filter_result = filter_result or (lambda x, log: True)\n self.relevance, self.plugin = relevance, plugin\n self.browser = browser\n self.cover_url = self.amazon_id = self.isbn = None\n self.domain = domain\n from lxml.html import tostring\n self.tostring = tostring\n\n months = { # {{{\n 'de': {\n 1: ['jän', 'januar'],\n 2: ['februar'],\n 3: ['märz'],\n 5: ['mai'],\n 6: ['juni'],\n 7: ['juli'],\n 10: ['okt', 'oktober'],\n 12: ['dez', 'dezember']\n },\n 'it': {\n 1: ['gennaio', 'enn'],\n 2: ['febbraio', 'febbr'],\n 3: ['marzo'],\n 4: ['aprile'],\n 5: ['maggio', 'magg'],\n 6: ['giugno'],\n 7: ['luglio'],\n 8: ['agosto', 'ag'],\n 9: ['settembre', 'sett'],\n 10: ['ottobre', 'ott'],\n 11: ['novembre'],\n 12: ['dicembre', 'dic'],\n },\n 'fr': {\n 1: ['janv'],\n 2: ['févr'],\n 3: ['mars'],\n 4: ['avril'],\n 5: ['mai'],\n 6: ['juin'],\n 7: ['juil'],\n 8: ['août'],\n 9: ['sept'],\n 12: ['déc'],\n },\n 'br': {\n 1: ['janeiro'],\n 2: ['fevereiro'],\n 3: ['março'],\n 4: ['abril'],\n 5: ['maio'],\n 6: ['junho'],\n 7: ['julho'],\n 8: ['agosto'],\n 9: ['setembro'],\n 10: ['outubro'],\n 11: ['novembro'],\n 12: ['dezembro'],\n },\n 'es': {\n 1: ['enero'],\n 2: ['febrero'],\n 3: ['marzo'],\n 4: ['abril'],\n 5: ['mayo'],\n 6: ['junio'],\n 7: ['julio'],\n 8: ['agosto'],\n 9: ['septiembre', 'setiembre'],\n 10: ['octubre'],\n 11: ['noviembre'],\n 12: ['diciembre'],\n },\n 'se': {\n 1: ['januari'],\n 2: ['februari'],\n 3: ['mars'],\n 4: ['april'],\n 5: ['maj'],\n 6: ['juni'],\n 7: ['juli'],\n 8: ['augusti'],\n 9: ['september'],\n 10: ['oktober'],\n 11: ['november'],\n 12: ['december'],\n },\n 'jp': {\n 1: ['1月'],\n 2: ['2月'],\n 3: ['3月'],\n 4: ['4月'],\n 5: ['5月'],\n 6: ['6月'],\n 7: ['7月'],\n 8: ['8月'],\n 9: ['9月'],\n 10: ['10月'],\n 11: ['11月'],\n 12: ['12月'],\n },\n 'nl': {\n 1: ['januari'], 2: ['februari'], 3: ['maart'], 5: ['mei'], 6: ['juni'], 7: ['juli'], 8: ['augustus'], 10: ['oktober'],\n }\n\n } # }}}\n\n self.english_months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',\n 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']\n self.months = months.get(self.domain, {})\n\n self.pd_xpath = '''\n //h2[text()=\"Product Details\" or \\\n text()=\"Produktinformation\" or \\\n text()=\"Dettagli prodotto\" or \\\n text()=\"Product details\" or \\\n text()=\"Détails sur le produit\" or \\\n text()=\"Detalles del producto\" or \\\n text()=\"Detalhes do produto\" or \\\n text()=\"Productgegevens\" or \\\n text()=\"基本信息\" or \\\n starts-with(text(), \"登録情報\")]/../div[@class=\"content\"]\n '''\n # Editor: is for Spanish\n self.publisher_xpath = '''\n descendant::*[starts-with(text(), \"Publisher:\") or \\\n starts-with(text(), \"Verlag:\") or \\\n starts-with(text(), \"Editore:\") or \\\n starts-with(text(), \"Editeur\") or \\\n starts-with(text(), \"Editor:\") or \\\n starts-with(text(), \"Editora:\") or \\\n starts-with(text(), \"Uitgever:\") or \\\n starts-with(text(), \"Utgivare:\") or \\\n starts-with(text(), \"出版社:\")]\n '''\n self.pubdate_xpath = '''\n descendant::*[starts-with(text(), \"Publication Date:\") or \\\n starts-with(text(), \"Audible.com Release Date:\")]\n '''\n self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Utgivare', 'Herausgeber',\n 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}\n\n self.language_xpath = '''\n descendant::*[\n starts-with(text(), \"Language:\") \\\n or text() = \"Language\" \\\n or text() = \"Sprache:\" \\\n or text() = \"Lingua:\" \\\n or text() = \"Idioma:\" \\\n or starts-with(text(), \"Langue\") \\\n or starts-with(text(), \"言語\") \\\n or starts-with(text(), \"Språk\") \\\n or starts-with(text(), \"语种\")\n ]\n '''\n self.language_names = {'Language', 'Sprache', 'Språk',\n 'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}\n\n self.tags_xpath = '''\n descendant::h2[\n text() = \"Look for Similar Items by Category\" or\n text() = \"Ähnliche Artikel finden\" or\n text() = \"Buscar productos similares por categoría\" or\n text() = \"Ricerca articoli simili per categoria\" or\n text() = \"Rechercher des articles similaires par rubrique\" or\n text() = \"Procure por items similares por categoria\" or\n text() = \"関連商品を探す\"\n ]/../descendant::ul/li\n '''\n\n self.ratings_pat = re.compile(\n r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) '\n r'([\\d\\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}'\n )\n self.ratings_pat_cn = re.compile('([0-9.]+) 颗星,最多 5 颗星')\n self.ratings_pat_jp = re.compile(r'\\d+つ星のうち([\\d\\.]+)')\n\n lm = {\n 'eng': ('English', 'Englisch', 'Engels', 'Engelska'),\n 'fra': ('French', 'Français'),\n 'ita': ('Italian', 'Italiano'),\n 'deu': ('German', 'Deutsch'),\n 'spa': ('Spanish', 'Espa\\xf1ol', 'Espaniol'),\n 'jpn': ('Japanese', '日本語'),\n 'por': ('Portuguese', 'Português'),\n 'nld': ('Dutch', 'Nederlands',),\n 'chs': ('Chinese', '中文', '简体中文'),\n 'swe': ('Swedish', 'Svenska'),\n }\n self.lang_map = {}\n for code, names in lm.items():\n for name in names:\n self.lang_map[name] = code\n\n self.series_pat = re.compile(\n r'''\n \\|\\s* # Prefix\n (Series)\\s*:\\s* # Series declaration\n (?P<series>.+?)\\s+ # The series name\n \\((Book)\\s* # Book declaration\n (?P<index>[0-9.]+) # Series index\n \\s*\\)\n ''', re.X)\n\n def delocalize_datestr(self, raw):\n if self.domain == 'cn':\n return raw.replace('年', '-').replace('月', '-').replace('日', '')\n if not self.months:\n return raw\n ans = raw.lower()\n for i, vals in self.months.items():\n for x in vals:\n ans = ans.replace(x, self.english_months[i])\n ans = ans.replace(' de ', ' ')\n return ans\n\n def run(self):\n try:\n self.get_details()\n except:\n self.log.exception('get_details failed for url: %r' % self.url)\n\n def get_details(self):\n if self.preparsed_root is None:\n raw, root, selector = parse_details_page(\n self.url, self.log, self.timeout, self.browser, self.domain)\n else:\n raw, root, selector = self.preparsed_root\n\n from css_selectors import Select\n self.selector = Select(root)\n self.parse_details(raw, root)\n\n def parse_details(self, raw, root):\n asin = parse_asin(root, self.log, self.url)\n if not asin and root.xpath('//form[@action=\"/errors/validateCaptcha\"]'):\n raise CaptchaError(\n 'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')\n if self.testing:\n import tempfile\n import uuid\n with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',\n suffix='.html', delete=False) as f:\n f.write(raw)\n print('Downloaded HTML for', asin, 'saved in', f.name)\n\n try:\n title = self.parse_title(root)\n except:\n self.log.exception('Error parsing title for url: %r' % self.url)\n title = None\n\n try:\n authors = self.parse_authors(root)\n except:\n self.log.exception('Error parsing authors for url: %r' % self.url)\n authors = []\n\n if not title or not authors or not asin:\n self.log.error(\n 'Could not find title/authors/asin for %r' % self.url)\n self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,\n authors))\n return\n\n mi = Metadata(title, authors)\n idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain\n mi.set_identifier(idtype, asin)\n self.amazon_id = asin\n\n try:\n mi.rating = self.parse_rating(root)\n except:\n self.log.exception('Error parsing ratings for url: %r' % self.url)\n\n try:\n mi.comments = self.parse_comments(root, raw)\n except:\n self.log.exception('Error parsing comments for url: %r' % self.url)\n\n try:\n series, series_index = self.parse_series(root)\n if series:\n mi.series, mi.series_index = series, series_index\n elif self.testing:\n mi.series, mi.series_index = 'Dummy series for testing', 1\n except:\n self.log.exception('Error parsing series for url: %r' % self.url)\n\n try:\n mi.tags = self.parse_tags(root)\n except:\n self.log.exception('Error parsing tags for url: %r' % self.url)\n\n try:\n self.cover_url = self.parse_cover(root, raw)\n except:\n self.log.exception('Error parsing cover for url: %r' % self.url)\n if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'):\n self.cover_url = self.cover_url_processor(self.cover_url)\n mi.has_cover = bool(self.cover_url)\n\n detail_bullets = root.xpath('//*[@data-feature-name=\"detailBullets\"]')\n non_hero = tuple(self.selector(\n 'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(\n '#productDetails_techSpec_sections'))\n if detail_bullets:\n self.parse_detail_bullets(root, mi, detail_bullets[0])\n elif non_hero:\n try:\n self.parse_new_details(root, mi, non_hero[0])\n except:\n self.log.exception(\n 'Failed to parse new-style book details section')\n\n else:\n pd = root.xpath(self.pd_xpath)\n if pd:\n pd = pd[0]\n\n try:\n isbn = self.parse_isbn(pd)\n if isbn:\n self.isbn = mi.isbn = isbn\n except:\n self.log.exception(\n 'Error parsing ISBN for url: %r' % self.url)\n\n try:\n mi.publisher = self.parse_publisher(pd)\n except:\n self.log.exception(\n 'Error parsing publisher for url: %r' % self.url)\n\n try:\n mi.pubdate = self.parse_pubdate(pd)\n except:\n self.log.exception(\n 'Error parsing publish date for url: %r' % self.url)\n\n try:\n lang = self.parse_language(pd)\n if lang:\n mi.language = lang\n except:\n self.log.exception(\n 'Error parsing language for url: %r' % self.url)\n\n else:\n self.log.warning(\n 'Failed to find product description for url: %r' % self.url)\n\n mi.source_relevance = self.relevance\n\n if self.amazon_id:\n if self.isbn:\n self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)\n if self.cover_url:\n self.plugin.cache_identifier_to_cover_url(self.amazon_id,\n self.cover_url)\n\n self.plugin.clean_downloaded_metadata(mi)\n\n if self.filter_result(mi, self.log):\n self.result_queue.put(mi)\n\n def totext(self, elem, only_printable=False):\n res = self.tostring(elem, encoding='unicode', method='text')\n if only_printable:\n try:\n filtered_characters = list(s for s in res if s.isprintable())\n except AttributeError:\n filtered_characters = list(s for s in res if s in string.printable)\n res = ''.join(filtered_characters).strip()\n return res\n\n def parse_title(self, root):\n\n def sanitize_title(title):\n ans = title.strip()\n if not ans.startswith('['):\n ans = re.sub(r'[(\\[].*[)\\]]', '', title).strip()\n return ans\n\n h1 = root.xpath('//h1[@id=\"title\"]')\n if h1:\n h1 = h1[0]\n for child in h1.xpath('./*[contains(@class, \"a-color-secondary\")]'):\n h1.remove(child)\n return sanitize_title(self.totext(h1))\n tdiv = root.xpath('//h1[contains(@class, \"parseasinTitle\")]')\n if not tdiv:\n span = root.xpath('//*[@id=\"ebooksTitle\"]')\n if span:\n return sanitize_title(self.totext(span[0]))\n raise ValueError('No title block found')\n tdiv = tdiv[0]\n actual_title = tdiv.xpath('descendant::*[@id=\"btAsinTitle\"]')\n if actual_title:\n title = self.tostring(actual_title[0], encoding='unicode',\n method='text').strip()\n else:\n title = self.tostring(tdiv, encoding='unicode',\n method='text').strip()\n return sanitize_title(title)\n\n def parse_authors(self, root):\n for sel in (\n '#byline .author .contributorNameID',\n '#byline .author a.a-link-normal',\n '#bylineInfo .author .contributorNameID',\n '#bylineInfo .author a.a-link-normal',\n '#bylineInfo #bylineContributor',\n ):\n matches = tuple(self.selector(sel))\n if matches:\n authors = [self.totext(x) for x in matches]\n return [a for a in authors if a]\n\n x = '//h1[contains(@class, \"parseasinTitle\")]/following-sibling::span/*[(name()=\"a\" and @href) or (name()=\"span\" and @class=\"contributorNameTrigger\")]'\n aname = root.xpath(x)\n if not aname:\n aname = root.xpath('''\n //h1[contains(@class, \"parseasinTitle\")]/following-sibling::*[(name()=\"a\" and @href) or (name()=\"span\" and @class=\"contributorNameTrigger\")]\n ''')\n for x in aname:\n x.tail = ''\n authors = [self.tostring(x, encoding='unicode', method='text').strip() for x\n in aname]\n authors = [a for a in authors if a]\n return authors\n\n def parse_rating(self, root):\n for x in root.xpath('//div[@id=\"cpsims-feature\" or @id=\"purchase-sims-feature\" or @id=\"rhf\"]'):\n # Remove the similar books section as it can cause spurious\n # ratings matches\n x.getparent().remove(x)\n\n rating_paths = (\n '//div[@data-feature-name=\"averageCustomerReviews\" or @id=\"averageCustomerReviews\"]',\n '//div[@class=\"jumpBar\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]',\n '//div[@class=\"buying\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]',\n '//span[@class=\"crAvgStars\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]'\n )\n ratings = None\n for p in rating_paths:\n ratings = root.xpath(p)\n if ratings:\n break\n\n def parse_ratings_text(text):\n try:\n m = self.ratings_pat.match(text)\n return float(m.group(1).replace(',', '.')) / float(m.group(3)) * 5\n except Exception:\n pass\n\n if ratings:\n ratings = ratings[0]\n for elem in ratings.xpath('descendant::*[@title]'):\n t = elem.get('title').strip()\n if self.domain == 'cn':\n m = self.ratings_pat_cn.match(t)\n if m is not None:\n return float(m.group(1))\n elif self.domain == 'jp':\n m = self.ratings_pat_jp.match(t)\n if m is not None:\n return float(m.group(1))\n else:\n ans = parse_ratings_text(t)\n if ans is not None:\n return ans\n for elem in ratings.xpath('descendant::span[@class=\"a-icon-alt\"]'):\n t = self.tostring(\n elem, encoding='unicode', method='text', with_tail=False).strip()\n ans = parse_ratings_text(t)\n if ans is not None:\n return ans\n\n def _render_comments(self, desc):\n from calibre.library.comments import sanitize_comments_html\n\n for c in desc.xpath('descendant::noscript'):\n c.getparent().remove(c)\n for c in desc.xpath('descendant::*[@class=\"seeAll\" or'\n ' @class=\"emptyClear\" or @id=\"collapsePS\" or'\n ' @id=\"expandPS\"]'):\n c.getparent().remove(c)\n for b in desc.xpath('descendant::b[@style]'):\n # Bing highlights search results\n s = b.get('style', '')\n if 'color' in s:\n b.tag = 'span'\n del b.attrib['style']\n\n for a in desc.xpath('descendant::a[@href]'):\n del a.attrib['href']\n a.tag = 'span'\n desc = self.tostring(desc, method='html', encoding='unicode').strip()\n desc = xml_replace_entities(desc, 'utf-8')\n\n # Encoding bug in Amazon data U+fffd (replacement char)\n # in some examples it is present in place of '\n desc = desc.replace('\\ufffd', \"'\")\n # remove all attributes from tags\n desc = re.sub(r'<([a-zA-Z0-9]+)\\s[^>]+>', r'<\\1>', desc)\n # Collapse whitespace\n # desc = re.sub('\\n+', '\\n', desc)\n # desc = re.sub(' +', ' ', desc)\n # Remove the notice about text referring to out of print editions\n desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)\n # Remove comments\n desc = re.sub(r'(?s)<!--.*?-->', '', desc)\n return sanitize_comments_html(desc)\n\n def parse_comments(self, root, raw):\n try:\n from urllib.parse import unquote\n except ImportError:\n from urllib import unquote\n ans = ''\n ns = tuple(self.selector('#bookDescription_feature_div noscript'))\n if ns:\n ns = ns[0]\n if len(ns) == 0 and ns.text:\n import html5lib\n # html5lib parsed noscript as CDATA\n ns = html5lib.parseFragment(\n '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]\n else:\n ns.tag = 'div'\n ans = self._render_comments(ns)\n else:\n desc = root.xpath('//div[@id=\"ps-content\"]/div[@class=\"content\"]')\n if desc:\n ans = self._render_comments(desc[0])\n else:\n ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content'))\n if ns:\n ans = self._render_comments(ns[0])\n\n desc = root.xpath(\n '//div[@id=\"productDescription\"]/*[@class=\"content\"]')\n if desc:\n ans += self._render_comments(desc[0])\n else:\n # Idiot chickens from amazon strike again. This data is now stored\n # in a JS variable inside a script tag URL encoded.\n m = re.search(br'var\\s+iframeContent\\s*=\\s*\"([^\"]+)\"', raw)\n if m is not None:\n try:\n text = unquote(m.group(1)).decode('utf-8')\n nr = parse_html(text)\n desc = nr.xpath(\n '//div[@id=\"productDescription\"]/*[@class=\"content\"]')\n if desc:\n ans += self._render_comments(desc[0])\n except Exception as e:\n self.log.warn(\n 'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))\n else:\n desc = root.xpath('//div[@id=\"productDescription_fullView\"]')\n if desc:\n ans += self._render_comments(desc[0])\n\n return ans\n\n def parse_series(self, root):\n ans = (None, None)\n\n # This is found on the paperback/hardback pages for books on amazon.com\n series = root.xpath('//div[@data-feature-name=\"seriesTitle\"]')\n if series:\n series = series[0]\n spans = series.xpath('./span')\n if spans:\n raw = self.tostring(\n spans[0], encoding='unicode', method='text', with_tail=False).strip()\n m = re.search(r'\\s+([0-9.]+)$', raw.strip())\n if m is not None:\n series_index = float(m.group(1))\n s = series.xpath('./a[@id=\"series-page-link\"]')\n if s:\n series = self.tostring(\n s[0], encoding='unicode', method='text', with_tail=False).strip()\n if series:\n ans = (series, series_index)\n else:\n series = root.xpath('//div[@id=\"seriesBulletWidget_feature_div\"]')\n if series:\n a = series[0].xpath('descendant::a')\n if a:\n raw = self.tostring(a[0], encoding='unicode', method='text', with_tail=False)\n if self.domain == 'jp':\n m = re.search(r'(?P<index>[0-9.]+)\\s*(?:巻|冊)\\s*\\(全\\s*([0-9.]+)\\s*(?:巻|冊)\\):\\s*(?P<series>.+)', raw.strip())\n else:\n m = re.search(r'(?:Book|Libro|Buch)\\s+(?P<index>[0-9.]+)\\s+(?:of|de|von)\\s+([0-9.]+)\\s*:\\s*(?P<series>.+)', raw.strip())\n if m is not None:\n ans = (m.group('series').strip(), float(m.group('index')))\n\n # This is found on Kindle edition pages on amazon.com\n if ans == (None, None):\n for span in root.xpath('//div[@id=\"aboutEbooksSection\"]//li/span'):\n text = (span.text or '').strip()\n m = re.match(r'Book\\s+([0-9.]+)', text)\n if m is not None:\n series_index = float(m.group(1))\n a = span.xpath('./a[@href]')\n if a:\n series = self.tostring(\n a[0], encoding='unicode', method='text', with_tail=False).strip()\n if series:\n ans = (series, series_index)\n # This is found on newer Kindle edition pages on amazon.com\n if ans == (None, None):\n for b in root.xpath('//div[@id=\"reviewFeatureGroup\"]/span/b'):\n text = (b.text or '').strip()\n m = re.match(r'Book\\s+([0-9.]+)', text)\n if m is not None:\n series_index = float(m.group(1))\n a = b.getparent().xpath('./a[@href]')\n if a:\n series = self.tostring(\n a[0], encoding='unicode', method='text', with_tail=False).partition('(')[0].strip()\n if series:\n ans = series, series_index\n\n if ans == (None, None):\n desc = root.xpath('//div[@id=\"ps-content\"]/div[@class=\"buying\"]')\n if desc:\n raw = self.tostring(desc[0], method='text', encoding='unicode')\n raw = re.sub(r'\\s+', ' ', raw)\n match = self.series_pat.search(raw)\n if match is not None:\n s, i = match.group('series'), float(match.group('index'))\n if s:\n ans = (s, i)\n if ans[0]:\n ans = (re.sub(r'\\s+Series$', '', ans[0]).strip(), ans[1])\n ans = (re.sub(r'\\(.+?\\s+Series\\)$', '', ans[0]).strip(), ans[1])\n return ans\n\n def parse_tags(self, root):\n ans = []\n exclude_tokens = {'kindle', 'a-z'}\n exclude = {'special features', 'by authors',\n 'authors & illustrators', 'books', 'new; used & rental textbooks'}\n seen = set()\n for li in root.xpath(self.tags_xpath):\n for i, a in enumerate(li.iterdescendants('a')):\n if i > 0:\n # we ignore the first category since it is almost always\n # too broad\n raw = (a.text or '').strip().replace(',', ';')\n lraw = icu_lower(raw)\n tokens = frozenset(lraw.split())\n if raw and lraw not in exclude and not tokens.intersection(exclude_tokens) and lraw not in seen:\n ans.append(raw)\n seen.add(lraw)\n return ans\n\n def parse_cover(self, root, raw=b\"\"):\n # Look for the image URL in javascript, using the first image in the\n # image gallery as the cover\n import json\n imgpat = re.compile(r\"\"\"'imageGalleryData'\\s*:\\s*(\\[\\s*{.+])\"\"\")\n for script in root.xpath('//script'):\n m = imgpat.search(script.text or '')\n if m is not None:\n try:\n return json.loads(m.group(1))[0]['mainUrl']\n except Exception:\n continue\n\n def clean_img_src(src):\n parts = src.split('/')\n if len(parts) > 3:\n bn = parts[-1]\n sparts = bn.split('_')\n if len(sparts) > 2:\n bn = re.sub(r'\\.\\.jpg$', '.jpg', (sparts[0] + sparts[-1]))\n return ('/'.join(parts[:-1])) + '/' + bn\n\n imgpat2 = re.compile(r'var imageSrc = \"([^\"]+)\"')\n for script in root.xpath('//script'):\n m = imgpat2.search(script.text or '')\n if m is not None:\n src = m.group(1)\n url = clean_img_src(src)\n if url:\n return url\n\n imgs = root.xpath(\n '//img[(@id=\"prodImage\" or @id=\"original-main-image\" or @id=\"main-image\" or @id=\"main-image-nonjs\") and @src]')\n if not imgs:\n imgs = (\n root.xpath('//div[@class=\"main-image-inner-wrapper\"]/img[@src]') or\n root.xpath('//div[@id=\"main-image-container\" or @id=\"ebooks-main-image-container\"]//img[@src]') or\n root.xpath(\n '//div[@id=\"mainImageContainer\"]//img[@data-a-dynamic-image]')\n )\n for img in imgs:\n try:\n idata = json.loads(img.get('data-a-dynamic-image'))\n except Exception:\n imgs = ()\n else:\n mwidth = 0\n try:\n url = None\n for iurl, (width, height) in idata.items():\n if width > mwidth:\n mwidth = width\n url = iurl\n return url\n except Exception:\n pass\n\n for img in imgs:\n src = img.get('src')\n if 'data:' in src:\n continue\n if 'loading-' in src:\n js_img = re.search(br'\"largeImage\":\"(https?://[^\"]+)\",', raw)\n if js_img:\n src = js_img.group(1).decode('utf-8')\n if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):\n self.log('Found image: %s' % src)\n url = clean_img_src(src)\n if url:\n return url\n\n def parse_detail_bullets(self, root, mi, container):\n ul = next(self.selector('.detail-bullet-list', root=container))\n for span in self.selector('.a-list-item', root=ul):\n cells = span.xpath('./span')\n if len(cells) >= 2:\n self.parse_detail_cells(mi, cells[0], cells[1])\n\n def parse_new_details(self, root, mi, non_hero):\n table = non_hero.xpath('descendant::table')[0]\n for tr in table.xpath('descendant::tr'):\n cells = tr.xpath('descendant::*[local-name()=\"td\" or local-name()=\"th\"]')\n if len(cells) == 2:\n self.parse_detail_cells(mi, cells[0], cells[1])\n\n def parse_detail_cells(self, mi, c1, c2):\n name = self.totext(c1, only_printable=True).strip().strip(':').strip()\n val = self.totext(c2).strip()\n if not val:\n return\n if name in self.language_names:\n ans = self.lang_map.get(val, None)\n if not ans:\n ans = canonicalize_lang(val)\n if ans:\n mi.language = ans\n elif name in self.publisher_names:\n pub = val.partition(';')[0].partition('(')[0].strip()\n if pub:\n mi.publisher = pub\n date = val.rpartition('(')[-1].replace(')', '').strip()\n try:\n from calibre.utils.date import parse_only_date\n date = self.delocalize_datestr(date)\n mi.pubdate = parse_only_date(date, assume_utc=True)\n except:\n self.log.exception('Failed to parse pubdate: %s' % val)\n elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:\n ans = check_isbn(val)\n if ans:\n self.isbn = mi.isbn = ans\n elif name in {'Publication date'}:\n from calibre.utils.date import parse_only_date\n date = self.delocalize_datestr(val)\n mi.pubdate = parse_only_date(date, assume_utc=True)\n\n def parse_isbn(self, pd):\n items = pd.xpath(\n 'descendant::*[starts-with(text(), \"ISBN\")]')\n if not items:\n items = pd.xpath(\n 'descendant::b[contains(text(), \"ISBN:\")]')\n for x in reversed(items):\n if x.tail:\n ans = check_isbn(x.tail.strip())\n if ans:\n return ans\n\n def parse_publisher(self, pd):\n for x in reversed(pd.xpath(self.publisher_xpath)):\n if x.tail:\n ans = x.tail.partition(';')[0]\n return ans.partition('(')[0].strip()\n\n def parse_pubdate(self, pd):\n from calibre.utils.date import parse_only_date\n for x in reversed(pd.xpath(self.pubdate_xpath)):\n if x.tail:\n date = x.tail.strip()\n date = self.delocalize_datestr(date)\n try:\n return parse_only_date(date, assume_utc=True)\n except Exception:\n pass\n for x in reversed(pd.xpath(self.publisher_xpath)):\n if x.tail:\n ans = x.tail\n date = ans.rpartition('(')[-1].replace(')', '').strip()\n date = self.delocalize_datestr(date)\n try:\n return parse_only_date(date, assume_utc=True)\n except Exception:\n pass\n\n def parse_language(self, pd):\n for x in reversed(pd.xpath(self.language_xpath)):\n if x.tail:\n raw = x.tail.strip().partition(',')[0].strip()\n ans = self.lang_map.get(raw, None)\n if ans:\n return ans\n ans = canonicalize_lang(ans)\n if ans:\n return ans\n# }}}\n\n\nclass Amazon(Source):\n\n name = 'Amazon.com'\n version = (1, 3, 0)\n minimum_calibre_version = (2, 82, 0)\n description = _('Downloads metadata and covers from Amazon')\n\n capabilities = frozenset(('identify', 'cover'))\n touched_fields = frozenset(('title', 'authors', 'identifier:amazon',\n 'rating', 'comments', 'publisher', 'pubdate',\n 'languages', 'series', 'tags'))\n has_html_comments = True\n supports_gzip_transfer_encoding = True\n prefer_results_with_isbn = False\n\n AMAZON_DOMAINS = {\n 'com': _('US'),\n 'fr': _('France'),\n 'de': _('Germany'),\n 'uk': _('UK'),\n 'au': _('Australia'),\n 'it': _('Italy'),\n 'jp': _('Japan'),\n 'es': _('Spain'),\n 'br': _('Brazil'),\n 'in': _('India'),\n 'nl': _('Netherlands'),\n 'cn': _('China'),\n 'ca': _('Canada'),\n 'se': _('Sweden'),\n }\n\n SERVERS = {\n 'auto': _('Choose server automatically'),\n 'amazon': _('Amazon servers'),\n 'bing': _('Bing search cache'),\n 'google': _('Google search cache'),\n 'wayback': _('Wayback machine cache (slow)'),\n 'ddg': _('DuckDuckGo search and Google cache'),\n }\n\n options = (\n Option('domain', 'choices', 'com', _('Amazon country website to use:'),\n _('Metadata from Amazon will be fetched using this '\n 'country\\'s Amazon website.'), choices=AMAZON_DOMAINS),\n Option('server', 'choices', 'auto', _('Server to get data from:'),\n _(\n 'Amazon has started blocking attempts to download'\n ' metadata from its servers. To get around this problem,'\n ' calibre can fetch the Amazon data from many different'\n ' places where it is cached. Choose the source you prefer.'\n ), choices=SERVERS),\n Option('use_mobi_asin', 'bool', False, _('Use the MOBI-ASIN for metadata search'),\n _(\n 'Enable this option to search for metadata with an'\n ' ASIN identifier from the MOBI file at the current country website,'\n ' unless any other amazon id is available. Note that if the'\n ' MOBI file came from a different Amazon country store, you could get'\n ' incorrect results.'\n )),\n Option('prefer_kindle_edition', 'bool', False, _('Prefer the Kindle edition, when available'),\n _(\n 'When searching for a book and the search engine returns both paper and Kindle editions,'\n ' always prefer the Kindle edition, instead of whatever the search engine returns at the'\n ' top.')\n ),\n )\n\n def __init__(self, *args, **kwargs):\n Source.__init__(self, *args, **kwargs)\n self.set_amazon_id_touched_fields()\n\n def id_from_url(self, url):\n from polyglot.urllib import urlparse\n purl = urlparse(url)\n if purl.netloc and purl.path and '/dp/' in purl.path:\n host_parts = tuple(x.lower() for x in purl.netloc.split('.'))\n if 'amazon' in host_parts:\n domain = host_parts[-1]\n parts = purl.path.split('/')\n idx = parts.index('dp')\n try:\n val = parts[idx+1]\n except IndexError:\n return\n aid = 'amazon' if domain == 'com' else ('amazon_' + domain)\n return aid, val\n\n def test_fields(self, mi):\n '''\n Return the first field from self.touched_fields that is null on the\n mi object\n '''\n for key in self.touched_fields:\n if key.startswith('identifier:'):\n key = key.partition(':')[-1]\n if key == 'amazon':\n if self.domain != 'com':\n key += '_' + self.domain\n if not mi.has_identifier(key):\n return 'identifier: ' + key\n elif mi.is_null(key):\n return key\n\n @property\n def browser(self):\n br = self._browser\n if br is None:\n ua = 'Mobile '\n while not user_agent_is_ok(ua):\n ua = random_user_agent(allow_ie=False)\n # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'\n self._browser = br = browser(user_agent=ua)\n br.set_handle_gzip(True)\n if self.use_search_engine:\n br.addheaders += [\n ('Accept', accept_header_for_ua(ua)),\n ('Upgrade-insecure-requests', '1'),\n ]\n else:\n br.addheaders += [\n ('Accept', accept_header_for_ua(ua)),\n ('Upgrade-insecure-requests', '1'),\n ('Referer', self.referrer_for_domain()),\n ]\n return br\n\n def save_settings(self, *args, **kwargs):\n Source.save_settings(self, *args, **kwargs)\n self.set_amazon_id_touched_fields()\n\n def set_amazon_id_touched_fields(self):\n ident_name = \"identifier:amazon\"\n if self.domain != 'com':\n ident_name += '_' + self.domain\n tf = [x for x in self.touched_fields if not\n x.startswith('identifier:amazon')] + [ident_name]\n self.touched_fields = frozenset(tf)\n\n def get_domain_and_asin(self, identifiers, extra_domains=()):\n identifiers = {k.lower(): v for k, v in identifiers.items()}\n for key, val in identifiers.items():\n if key in ('amazon', 'asin'):\n return 'com', val\n if key.startswith('amazon_'):\n domain = key.partition('_')[-1]\n if domain and (domain in self.AMAZON_DOMAINS or domain in extra_domains):\n return domain, val\n if self.prefs['use_mobi_asin']:\n val = identifiers.get('mobi-asin')\n if val is not None:\n return self.domain, val\n return None, None\n\n def referrer_for_domain(self, domain=None):\n domain = domain or self.domain\n return {\n 'uk': 'https://www.amazon.co.uk/',\n 'au': 'https://www.amazon.com.au/',\n 'br': 'https://www.amazon.com.br/',\n 'jp': 'https://www.amazon.co.jp/',\n }.get(domain, 'https://www.amazon.%s/' % domain)\n\n def _get_book_url(self, identifiers): # {{{\n domain, asin = self.get_domain_and_asin(\n identifiers, extra_domains=('au', 'ca'))\n if domain and asin:\n url = None\n r = self.referrer_for_domain(domain)\n if r is not None:\n url = r + 'dp/' + asin\n if url:\n idtype = 'amazon' if domain == 'com' else 'amazon_' + domain\n return domain, idtype, asin, url\n\n def get_book_url(self, identifiers):\n ans = self._get_book_url(identifiers)\n if ans is not None:\n return ans[1:]\n\n def get_book_url_name(self, idtype, idval, url):\n if idtype == 'amazon':\n return self.name\n return 'A' + idtype.replace('_', '.')[1:]\n # }}}\n\n @property\n def domain(self):\n x = getattr(self, 'testing_domain', None)\n if x is not None:\n return x\n domain = self.prefs['domain']\n if domain not in self.AMAZON_DOMAINS:\n domain = 'com'\n\n return domain\n\n @property\n def server(self):\n x = getattr(self, 'testing_server', None)\n if x is not None:\n return x\n server = self.prefs['server']\n if server not in self.SERVERS:\n server = 'auto'\n return server\n\n @property\n def use_search_engine(self):\n return self.server != 'amazon'\n\n def clean_downloaded_metadata(self, mi):\n docase = (\n mi.language == 'eng' or\n (mi.is_null('language') and self.domain in {'com', 'uk', 'au'})\n )\n if mi.title and docase:\n # Remove series information from title\n m = re.search(r'\\S+\\s+(\\(.+?\\s+Book\\s+\\d+\\))$', mi.title)\n if m is not None:\n mi.title = mi.title.replace(m.group(1), '').strip()\n mi.title = fixcase(mi.title)\n mi.authors = fixauthors(mi.authors)\n if mi.tags and docase:\n mi.tags = list(map(fixcase, mi.tags))\n mi.isbn = check_isbn(mi.isbn)\n if mi.series and docase:\n mi.series = fixcase(mi.series)\n if mi.title and mi.series:\n for pat in (r':\\s*Book\\s+\\d+\\s+of\\s+%s$', r'\\(%s\\)$', r':\\s*%s\\s+Book\\s+\\d+$'):\n pat = pat % re.escape(mi.series)\n q = re.sub(pat, '', mi.title, flags=re.I).strip()\n if q and q != mi.title:\n mi.title = q\n break\n\n def get_website_domain(self, domain):\n return {'uk': 'co.uk', 'jp': 'co.jp', 'br': 'com.br', 'au': 'com.au'}.get(domain, domain)\n\n def create_query(self, log, title=None, authors=None, identifiers={}, # {{{\n domain=None, for_amazon=True):\n try:\n from urllib.parse import urlencode, unquote_plus\n except ImportError:\n from urllib import urlencode, unquote_plus\n if domain is None:\n domain = self.domain\n\n idomain, asin = self.get_domain_and_asin(identifiers)\n if idomain is not None:\n domain = idomain\n\n # See the amazon detailed search page to get all options\n terms = []\n q = {'search-alias': 'aps',\n 'unfiltered': '1',\n }\n\n if domain == 'com':\n q['sort'] = 'relevanceexprank'\n else:\n q['sort'] = 'relevancerank'\n\n isbn = check_isbn(identifiers.get('isbn', None))\n\n if asin is not None:\n q['field-keywords'] = asin\n terms.append(asin)\n elif isbn is not None:\n q['field-isbn'] = isbn\n if len(isbn) == 13:\n terms.extend('({} OR {}-{})'.format(isbn, isbn[:3], isbn[3:]).split())\n else:\n terms.append(isbn)\n else:\n # Only return book results\n q['search-alias'] = {'br': 'digital-text',\n 'nl': 'aps'}.get(domain, 'stripbooks')\n if title:\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n q['field-title'] = ' '.join(title_tokens)\n terms.extend(title_tokens)\n if authors:\n author_tokens = list(self.get_author_tokens(authors,\n only_first_author=True))\n if author_tokens:\n q['field-author'] = ' '.join(author_tokens)\n terms.extend(author_tokens)\n\n if not ('field-keywords' in q or 'field-isbn' in q or\n ('field-title' in q)):\n # Insufficient metadata to make an identify query\n return None, None\n\n if not for_amazon:\n return terms, domain\n\n if domain == 'nl':\n q['__mk_nl_NL'] = 'ÅMÅŽÕÑ'\n if 'field-keywords' not in q:\n q['field-keywords'] = ''\n for f in 'field-isbn field-title field-author'.split():\n q['field-keywords'] += ' ' + q.pop(f, '')\n q['field-keywords'] = q['field-keywords'].strip()\n\n encoded_q = dict([(x.encode('utf-8', 'ignore'), y.encode(\n 'utf-8', 'ignore')) for x, y in q.items()])\n url_query = urlencode(encoded_q)\n # amazon's servers want IRIs with unicode characters not percent esaped\n parts = []\n for x in url_query.split(b'&' if isinstance(url_query, bytes) else '&'):\n k, v = x.split(b'=' if isinstance(x, bytes) else '=', 1)\n parts.append('{}={}'.format(iri_quote_plus(unquote_plus(k)), iri_quote_plus(unquote_plus(v))))\n url_query = '&'.join(parts)\n url = 'https://www.amazon.%s/s/?' % self.get_website_domain(\n domain) + url_query\n return url, domain\n\n # }}}\n\n def get_cached_cover_url(self, identifiers): # {{{\n url = None\n domain, asin = self.get_domain_and_asin(identifiers)\n if asin is None:\n isbn = identifiers.get('isbn', None)\n if isbn is not None:\n asin = self.cached_isbn_to_identifier(isbn)\n if asin is not None:\n url = self.cached_identifier_to_cover_url(asin)\n\n return url\n # }}}\n\n def parse_results_page(self, root, domain): # {{{\n from lxml.html import tostring\n\n matches = []\n\n def title_ok(title):\n title = title.lower()\n bad = ['bulk pack', '[audiobook]', '[audio cd]',\n '(a book companion)', '( slipcase with door )', ': free sampler']\n if self.domain == 'com':\n bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])\n for x in bad:\n if x in title:\n return False\n if title and title[0] in '[{' and re.search(r'\\(\\s*author\\s*\\)', title) is not None:\n # Bad entries in the catalog\n return False\n return True\n\n for query in (\n '//div[contains(@class, \"s-result-list\")]//h2/a[@href]',\n '//div[contains(@class, \"s-result-list\")]//div[@data-index]//h5//a[@href]',\n r'//li[starts-with(@id, \"result_\")]//a[@href and contains(@class, \"s-access-detail-page\")]',\n ):\n result_links = root.xpath(query)\n if result_links:\n break\n for a in result_links:\n title = tostring(a, method='text', encoding='unicode')\n if title_ok(title):\n url = a.get('href')\n if url.startswith('/'):\n url = 'https://www.amazon.%s%s' % (\n self.get_website_domain(domain), url)\n matches.append(url)\n\n if not matches:\n # Previous generation of results page markup\n for div in root.xpath(r'//div[starts-with(@id, \"result_\")]'):\n links = div.xpath(r'descendant::a[@class=\"title\" and @href]')\n if not links:\n # New amazon markup\n links = div.xpath('descendant::h3/a[@href]')\n for a in links:\n title = tostring(a, method='text', encoding='unicode')\n if title_ok(title):\n url = a.get('href')\n if url.startswith('/'):\n url = 'https://www.amazon.%s%s' % (\n self.get_website_domain(domain), url)\n matches.append(url)\n break\n\n if not matches:\n # This can happen for some user agents that Amazon thinks are\n # mobile/less capable\n for td in root.xpath(\n r'//div[@id=\"Results\"]/descendant::td[starts-with(@id, \"search:Td:\")]'):\n for a in td.xpath(r'descendant::td[@class=\"dataColumn\"]/descendant::a[@href]/span[@class=\"srTitle\"]/..'):\n title = tostring(a, method='text', encoding='unicode')\n if title_ok(title):\n url = a.get('href')\n if url.startswith('/'):\n url = 'https://www.amazon.%s%s' % (\n self.get_website_domain(domain), url)\n matches.append(url)\n break\n if not matches and root.xpath('//form[@action=\"/errors/validateCaptcha\"]'):\n raise CaptchaError('Amazon returned a CAPTCHA page. Recently Amazon has begun using statistical'\n ' profiling to block access to its website. As such this metadata plugin is'\n ' unlikely to ever work reliably.')\n\n # Keep only the top 3 matches as the matches are sorted by relevance by\n # Amazon so lower matches are not likely to be very relevant\n return matches[:3]\n # }}}\n\n def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{\n from calibre.utils.cleantext import clean_ascii_chars\n from calibre.ebooks.chardet import xml_to_unicode\n matches = []\n query, domain = self.create_query(log, title=title, authors=authors,\n identifiers=identifiers)\n if query is None:\n log.error('Insufficient metadata to construct query')\n raise SearchFailed()\n try:\n raw = br.open_novisit(query, timeout=timeout).read().strip()\n except Exception as e:\n if callable(getattr(e, 'getcode', None)) and \\\n e.getcode() == 404:\n log.error('Query malformed: %r' % query)\n raise SearchFailed()\n attr = getattr(e, 'args', [None])\n attr = attr if attr else [None]\n if isinstance(attr[0], socket.timeout):\n msg = _('Amazon timed out. Try again later.')\n log.error(msg)\n else:\n msg = 'Failed to make identify query: %r' % query\n log.exception(msg)\n raise SearchFailed()\n\n raw = clean_ascii_chars(xml_to_unicode(raw,\n strip_encoding_pats=True, resolve_entities=True)[0])\n\n if testing:\n import tempfile\n with tempfile.NamedTemporaryFile(prefix='amazon_results_',\n suffix='.html', delete=False) as f:\n f.write(raw.encode('utf-8'))\n print('Downloaded html for results page saved in', f.name)\n\n matches = []\n found = '<title>404 - ' not in raw\n\n if found:\n try:\n root = parse_html(raw)\n except Exception:\n msg = 'Failed to parse amazon page for query: %r' % query\n log.exception(msg)\n raise SearchFailed()\n\n matches = self.parse_results_page(root, domain)\n\n return matches, query, domain, None\n # }}}\n\n def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None): # {{{\n from calibre.ebooks.metadata.sources.update import search_engines_module\n se = search_engines_module()\n terms, domain = self.create_query(log, title=title, authors=authors,\n identifiers=identifiers, for_amazon=False)\n site = self.referrer_for_domain(\n domain)[len('https://'):].partition('/')[0]\n matches = []\n server = override_server or self.server\n urlproc, sfunc = se.google_url_processor, se.google_search\n if server == 'bing':\n urlproc, sfunc = se.bing_url_processor, se.bing_search\n elif server == 'wayback':\n urlproc, sfunc = se.wayback_url_processor, se.ddg_search\n elif server == 'ddg':\n urlproc, sfunc = se.ddg_url_processor, se.ddg_search\n try:\n results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)\n except HTTPError as err:\n if err.code == 429 and sfunc is se.google_search:\n log('Got too many requests error from Google, trying via DuckDuckGo')\n urlproc, sfunc = se.ddg_url_processor, se.ddg_search\n results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)\n else:\n raise\n\n br.set_current_header('Referer', qurl)\n for result in results:\n if abort.is_set():\n return matches, terms, domain, None\n\n purl = urlparse(result.url)\n if '/dp/' in purl.path and site in purl.netloc:\n url = result.cached_url\n if url is None:\n url = se.get_cached_url(result.url, br, timeout=timeout)\n if url is None:\n log('Failed to find cached page for:', result.url)\n continue\n if url not in matches:\n matches.append(url)\n if len(matches) >= 3:\n break\n else:\n log('Skipping non-book result:', result)\n if not matches:\n log('No search engine results for terms:', ' '.join(terms))\n if urlproc is se.google_url_processor:\n # Google does not cache adult titles\n log('Trying the bing search engine instead')\n return self.search_search_engine(br, testing, log, abort, title, authors, identifiers, timeout, 'bing')\n return matches, terms, domain, urlproc\n # }}}\n\n def identify(self, log, result_queue, abort, title=None, authors=None, # {{{\n identifiers={}, timeout=60):\n '''\n Note this method will retry without identifiers automatically if no\n match is found with identifiers.\n '''\n\n testing = getattr(self, 'running_a_test', False)\n\n udata = self._get_book_url(identifiers)\n br = self.browser\n log('User-agent:', br.current_user_agent())\n log('Server:', self.server)\n if testing:\n print('User-agent:', br.current_user_agent())\n if udata is not None and not self.use_search_engine:\n # Try to directly get details page instead of running a search\n # Cannot use search engine as the directly constructed URL is\n # usually redirected to a full URL by amazon, and is therefore\n # not cached\n domain, idtype, asin, durl = udata\n if durl is not None:\n preparsed_root = parse_details_page(\n durl, log, timeout, br, domain)\n if preparsed_root is not None:\n qasin = parse_asin(preparsed_root[1], log, durl)\n if qasin == asin:\n w = Worker(durl, result_queue, br, log, 0, domain,\n self, testing=testing, preparsed_root=preparsed_root, timeout=timeout)\n try:\n w.get_details()\n return\n except Exception:\n log.exception(\n 'get_details failed for url: %r' % durl)\n func = self.search_search_engine if self.use_search_engine else self.search_amazon\n try:\n matches, query, domain, cover_url_processor = func(\n br, testing, log, abort, title, authors, identifiers, timeout)\n except SearchFailed:\n return\n\n if abort.is_set():\n return\n\n if not matches:\n if identifiers and title and authors:\n log('No matches found with identifiers, retrying using only'\n ' title and authors. Query: %r' % query)\n time.sleep(1)\n return self.identify(log, result_queue, abort, title=title,\n authors=authors, timeout=timeout)\n log.error('No matches found with query: %r' % query)\n return\n\n if self.prefs['prefer_kindle_edition']:\n matches = sort_matches_preferring_kindle_editions(matches)\n\n workers = [Worker(\n url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout,\n cover_url_processor=cover_url_processor, filter_result=partial(\n self.filter_result, title, authors, identifiers)) for i, url in enumerate(matches)]\n\n for w in workers:\n # Don't send all requests at the same time\n time.sleep(1)\n w.start()\n if abort.is_set():\n return\n\n while not abort.is_set():\n a_worker_is_alive = False\n for w in workers:\n w.join(0.2)\n if abort.is_set():\n break\n if w.is_alive():\n a_worker_is_alive = True\n if not a_worker_is_alive:\n break\n\n return None\n # }}}\n\n def filter_result(self, title, authors, identifiers, mi, log): # {{{\n if not self.use_search_engine:\n return True\n if title is not None:\n\n def tokenize_title(x):\n return icu_lower(x).replace(\"'\", '').replace('\"', '').rstrip(':')\n\n tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}\n if tokens:\n result_tokens = {tokenize_title(x) for x in mi.title.split()}\n if not tokens.intersection(result_tokens):\n log('Ignoring result:', mi.title, 'as its title does not match')\n return False\n if authors:\n author_tokens = set()\n for author in authors:\n author_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}\n result_tokens = set()\n for author in mi.authors:\n result_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}\n if author_tokens and not author_tokens.intersection(result_tokens):\n log('Ignoring result:', mi.title, 'by', ' & '.join(mi.authors), 'as its author does not match')\n return False\n return True\n # }}}\n\n def download_cover(self, log, result_queue, abort, # {{{\n title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):\n cached_url = self.get_cached_cover_url(identifiers)\n if cached_url is None:\n log.info('No cached cover found, running identify')\n rq = Queue()\n self.identify(log, rq, abort, title=title, authors=authors,\n identifiers=identifiers)\n if abort.is_set():\n return\n if abort.is_set():\n return\n results = []\n while True:\n try:\n results.append(rq.get_nowait())\n except Empty:\n break\n results.sort(key=self.identify_results_keygen(\n title=title, authors=authors, identifiers=identifiers))\n for mi in results:\n cached_url = self.get_cached_cover_url(mi.identifiers)\n if cached_url is not None:\n break\n if cached_url is None:\n log.info('No cover found')\n return\n\n if abort.is_set():\n return\n log('Downloading cover from:', cached_url)\n br = self.browser\n if self.use_search_engine:\n br = br.clone_browser()\n br.set_current_header('Referer', self.referrer_for_domain(self.domain))\n try:\n time.sleep(1)\n cdata = br.open_novisit(\n cached_url, timeout=timeout).read()\n result_queue.put((self, cdata))\n except:\n log.exception('Failed to download cover from:', cached_url)\n # }}}\n\n\ndef manual_tests(domain, **kw): # {{{\n # To run these test use:\n # calibre-debug -c \"from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')\"\n from calibre.ebooks.metadata.sources.test import (test_identify_plugin,\n isbn_test, title_test, authors_test, comments_test, series_test)\n all_tests = {}\n all_tests['com'] = [ # {{{\n ( # Paperback with series\n {'identifiers': {'amazon': '1423146786'}},\n [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',\n exact=True), series_test('The Heroes of Olympus', 5)]\n ),\n\n ( # Kindle edition with series\n {'identifiers': {'amazon': 'B0085UEQDO'}},\n [title_test('Three Parts Dead', exact=True),\n series_test('Craft Sequence', 1)]\n ),\n\n ( # + in title and uses id=\"main-image\" for cover\n {'identifiers': {'amazon': '1933988770'}},\n [title_test(\n 'C++ Concurrency in Action: Practical Multithreading', exact=True)]\n ),\n\n\n ( # Different comments markup, using Book Description section\n {'identifiers': {'amazon': '0982514506'}},\n [title_test(\n \"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy\",\n exact=True),\n comments_test('Jelena'), comments_test('Ashinji'),\n ]\n ),\n\n ( # # in title\n {'title': 'Expert C# 2008 Business Objects',\n 'authors': ['Lhotka']},\n [title_test('Expert C#'),\n authors_test(['Rockford Lhotka'])\n ]\n ),\n\n ( # No specific problems\n {'identifiers': {'isbn': '0743273567'}},\n [title_test('the great gatsby: the only authorized edition', exact=True),\n authors_test(['Francis Scott Fitzgerald'])]\n ),\n\n ]\n\n # }}}\n\n all_tests['de'] = [ # {{{\n # series\n (\n {'identifiers': {'isbn': '3499275120'}},\n [title_test('Vespasian: Das Schwert des Tribuns: Historischer Roman',\n exact=False), authors_test(['Robert Fabbri']), series_test('Die Vespasian-Reihe', 1)\n ]\n\n ),\n\n ( # umlaut in title/authors\n {'title': 'Flüsternde Wälder',\n 'authors': ['Nicola Förg']},\n [title_test('Flüsternde Wälder'),\n authors_test(['Nicola Förg'], subset=True)\n ]\n ),\n\n (\n {'identifiers': {'isbn': '9783453314979'}},\n [title_test('Die letzten Wächter: Roman',\n exact=False), authors_test(['Sergej Lukianenko'])\n ]\n\n ),\n\n (\n {'identifiers': {'isbn': '3548283519'}},\n [title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',\n exact=False), authors_test(['Nele Neuhaus'])\n ]\n\n ),\n ] # }}}\n\n all_tests['it'] = [ # {{{\n (\n {'identifiers': {'isbn': '8838922195'}},\n [title_test('La briscola in cinque',\n exact=True), authors_test(['Marco Malvaldi'])\n ]\n\n ),\n ] # }}}\n\n all_tests['fr'] = [ # {{{\n (\n {'identifiers': {'amazon_fr': 'B07L7ST4RS'}},\n [title_test('Le secret de Lola', exact=True),\n authors_test(['Amélie BRIZIO'])\n ]\n ),\n (\n {'identifiers': {'isbn': '2221116798'}},\n [title_test('L\\'étrange voyage de Monsieur Daldry',\n exact=True), authors_test(['Marc Levy'])\n ]\n\n ),\n ] # }}}\n\n all_tests['es'] = [ # {{{\n (\n {'identifiers': {'isbn': '8483460831'}},\n [title_test('Tiempos Interesantes',\n exact=False), authors_test(['Terry Pratchett'])\n ]\n\n ),\n ] # }}}\n\n all_tests['se'] = [ # {{{\n (\n {'identifiers': {'isbn': '9780552140287'}},\n [title_test('Men At Arms: A Discworld Novel: 14',\n exact=False), authors_test(['Terry Pratchett'])\n ]\n\n ),\n ] # }}}\n\n all_tests['jp'] = [ # {{{\n ( # Adult filtering test\n {'identifiers': {'isbn': '4799500066'}},\n [title_test(' '), ]\n ),\n\n ( # isbn -> title, authors\n {'identifiers': {'isbn': '9784101302720'}},\n [title_test('精霊の守り人',\n exact=True), authors_test(['上橋 菜穂子'])\n ]\n ),\n ( # title, authors -> isbn (will use Shift_JIS encoding in query.)\n {'title': '考えない練習',\n 'authors': ['小池 龍之介']},\n [isbn_test('9784093881067'), ]\n ),\n ] # }}}\n\n all_tests['br'] = [ # {{{\n (\n {'title': 'Guerra dos Tronos'},\n [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',\n exact=True), authors_test(['George R. R. Martin'])\n ]\n\n ),\n ] # }}}\n\n all_tests['nl'] = [ # {{{\n (\n {'title': 'Freakonomics'},\n [title_test('Freakonomics',\n exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg'])\n ]\n\n ),\n ] # }}}\n\n all_tests['cn'] = [ # {{{\n (\n {'identifiers': {'isbn': '9787115369512'}},\n [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),\n authors_test(['[美]sam Williams', '邓楠,李凡希'])]\n ),\n (\n {'title': '爱上Raspberry Pi'},\n [title_test('爱上Raspberry Pi',\n exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希'])\n ]\n\n ),\n ] # }}}\n\n all_tests['ca'] = [ # {{{\n ( # Paperback with series\n {'identifiers': {'isbn': '9781623808747'}},\n [title_test('Parting Shot', exact=True),\n authors_test(['Mary Calmes'])]\n ),\n ( # # in title\n {'title': 'Expert C# 2008 Business Objects',\n 'authors': ['Lhotka']},\n [title_test('Expert C# 2008 Business Objects'),\n authors_test(['Rockford Lhotka'])]\n ),\n ( # noscript description\n {'identifiers': {'amazon_ca': '162380874X'}},\n [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])\n ]\n ),\n ] # }}}\n\n all_tests['in'] = [ # {{{\n ( # Paperback with series\n {'identifiers': {'amazon_in': '1423146786'}},\n [title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True)]\n ),\n ] # }}}\n\n def do_test(domain, start=0, stop=None, server='auto'):\n tests = all_tests[domain]\n if stop is None:\n stop = len(tests)\n tests = tests[start:stop]\n test_identify_plugin(Amazon.name, tests, modify_plugin=lambda p: (\n setattr(p, 'testing_domain', domain),\n setattr(p, 'touched_fields', p.touched_fields - {'tags'}),\n setattr(p, 'testing_server', server),\n ))\n\n do_test(domain, **kw)\n# }}}\n",
"big_book_search": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom calibre.ebooks.metadata.sources.base import Source, Option\n\n\ndef get_urls(br, tokens):\n from urllib.parse import quote_plus\n from html5_parser import parse\n escaped = (quote_plus(x) for x in tokens if x and x.strip())\n q = '+'.join(escaped)\n url = 'https://bigbooksearch.com/please-dont-scrape-my-site-you-will-put-my-api-key-over-the-usage-limit-and-the-site-will-break/books/'+q\n raw = br.open(url).read()\n root = parse(raw.decode('utf-8'))\n urls = [i.get('src') for i in root.xpath('//img[@src]')]\n return urls\n\n\nclass BigBookSearch(Source):\n\n name = 'Big Book Search'\n version = (1, 0, 1)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')\n capabilities = frozenset(['cover'])\n can_get_multiple_covers = True\n options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),\n _('The maximum number of covers to process from the search result')),\n )\n supports_gzip_transfer_encoding = True\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if not title:\n return\n br = self.browser\n tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))\n urls = get_urls(br, tokens)\n self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)\n\n\ndef test():\n from calibre import browser\n import pprint\n br = browser()\n urls = get_urls(br, ['consider', 'phlebas', 'banks'])\n pprint.pprint(urls)\n\n\nif __name__ == '__main__':\n test()\n",
"edelweiss": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nimport time, re\nfrom threading import Thread\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\n\nfrom calibre import as_unicode, random_user_agent\nfrom calibre.ebooks.metadata import check_isbn\nfrom calibre.ebooks.metadata.sources.base import Source\n\n\ndef clean_html(raw):\n from calibre.ebooks.chardet import xml_to_unicode\n from calibre.utils.cleantext import clean_ascii_chars\n return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,\n resolve_entities=True, assume_utf8=True)[0])\n\n\ndef parse_html(raw):\n raw = clean_html(raw)\n from html5_parser import parse\n return parse(raw)\n\n\ndef astext(node):\n from lxml import etree\n return etree.tostring(node, method='text', encoding='unicode',\n with_tail=False).strip()\n\n\nclass Worker(Thread): # {{{\n\n def __init__(self, basic_data, relevance, result_queue, br, timeout, log, plugin):\n Thread.__init__(self)\n self.daemon = True\n self.basic_data = basic_data\n self.br, self.log, self.timeout = br, log, timeout\n self.result_queue, self.plugin, self.sku = result_queue, plugin, self.basic_data['sku']\n self.relevance = relevance\n\n def run(self):\n url = ('https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/product/two_Enhanced.ascx&'\n 'sku={0}&idPrefix=content_1_{0}&mode=0'.format(self.sku))\n try:\n raw = self.br.open_novisit(url, timeout=self.timeout).read()\n except:\n self.log.exception('Failed to load comments page: %r'%url)\n return\n\n try:\n mi = self.parse(raw)\n mi.source_relevance = self.relevance\n self.plugin.clean_downloaded_metadata(mi)\n self.result_queue.put(mi)\n except:\n self.log.exception('Failed to parse details for sku: %s'%self.sku)\n\n def parse(self, raw):\n from calibre.ebooks.metadata.book.base import Metadata\n from calibre.utils.date import UNDEFINED_DATE\n root = parse_html(raw)\n mi = Metadata(self.basic_data['title'], self.basic_data['authors'])\n\n # Identifiers\n if self.basic_data['isbns']:\n mi.isbn = self.basic_data['isbns'][0]\n mi.set_identifier('edelweiss', self.sku)\n\n # Tags\n if self.basic_data['tags']:\n mi.tags = self.basic_data['tags']\n mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]\n\n # Publisher\n mi.publisher = self.basic_data['publisher']\n\n # Pubdate\n if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:\n mi.pubdate = self.basic_data['pubdate']\n\n # Rating\n if self.basic_data['rating']:\n mi.rating = self.basic_data['rating']\n\n # Comments\n comments = ''\n for cid in ('summary', 'contributorbio', 'quotes_reviews'):\n cid = 'desc_{}{}-content'.format(cid, self.sku)\n div = root.xpath('//*[@id=\"{}\"]'.format(cid))\n if div:\n comments += self.render_comments(div[0])\n if comments:\n mi.comments = comments\n\n mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None\n return mi\n\n def render_comments(self, desc):\n from lxml import etree\n from calibre.library.comments import sanitize_comments_html\n for c in desc.xpath('descendant::noscript'):\n c.getparent().remove(c)\n for a in desc.xpath('descendant::a[@href]'):\n del a.attrib['href']\n a.tag = 'span'\n desc = etree.tostring(desc, method='html', encoding='unicode').strip()\n\n # remove all attributes from tags\n desc = re.sub(r'<([a-zA-Z0-9]+)\\s[^>]+>', r'<\\1>', desc)\n # Collapse whitespace\n # desc = re.sub('\\n+', '\\n', desc)\n # desc = re.sub(' +', ' ', desc)\n # Remove comments\n desc = re.sub(r'(?s)<!--.*?-->', '', desc)\n return sanitize_comments_html(desc)\n# }}}\n\n\ndef get_basic_data(browser, log, *skus):\n from calibre.utils.date import parse_only_date\n from mechanize import Request\n zeroes = ','.join('0' for sku in skus)\n data = {\n 'skus': ','.join(skus),\n 'drc': zeroes,\n 'startPosition': '0',\n 'sequence': '1',\n 'selected': zeroes,\n 'itemID': '0',\n 'orderID': '0',\n 'mailingID': '',\n 'tContentWidth': '926',\n 'originalOrder': ','.join(type('')(i) for i in range(len(skus))),\n 'selectedOrderID': '0',\n 'selectedSortColumn': '0',\n 'listType': '1',\n 'resultType': '32',\n 'blockView': '1',\n }\n items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'\n req = Request(items_data_url, data)\n response = browser.open_novisit(req)\n raw = response.read()\n root = parse_html(raw)\n for item in root.xpath('//div[@data-priority]'):\n row = item.getparent().getparent()\n sku = item.get('id').split('-')[-1]\n isbns = [x.strip() for x in row.xpath('descendant::*[contains(@class, \"pev_sku\")]/text()')[0].split(',') if check_isbn(x.strip())]\n isbns.sort(key=len, reverse=True)\n try:\n tags = [x.strip() for x in astext(row.xpath('descendant::*[contains(@class, \"pev_categories\")]')[0]).split('/')]\n except IndexError:\n tags = []\n rating = 0\n for bar in row.xpath('descendant::*[contains(@class, \"bgdColorCommunity\")]/@style'):\n m = re.search(r'width: (\\d+)px;.*max-width: (\\d+)px', bar)\n if m is not None:\n rating = float(m.group(1)) / float(m.group(2))\n break\n try:\n pubdate = parse_only_date(astext(row.xpath('descendant::*[contains(@class, \"pev_shipDate\")]')[0]\n ).split(':')[-1].split(u'\\xa0')[-1].strip(), assume_utc=True)\n except Exception:\n log.exception('Error parsing published date')\n pubdate = None\n authors = []\n for x in [x.strip() for x in row.xpath('descendant::*[contains(@class, \"pev_contributor\")]/@title')]:\n authors.extend(a.strip() for a in x.split(','))\n entry = {\n 'sku': sku,\n 'cover': row.xpath('descendant::img/@src')[0].split('?')[0],\n 'publisher': astext(row.xpath('descendant::*[contains(@class, \"headerPublisher\")]')[0]),\n 'title': astext(row.xpath('descendant::*[@id=\"title_{}\"]'.format(sku))[0]),\n 'authors': authors,\n 'isbns': isbns,\n 'tags': tags,\n 'pubdate': pubdate,\n 'format': ' '.join(row.xpath('descendant::*[contains(@class, \"pev_format\")]/text()')).strip(),\n 'rating': rating,\n }\n if entry['cover'].startswith('/'):\n entry['cover'] = None\n yield entry\n\n\nclass Edelweiss(Source):\n\n name = 'Edelweiss'\n version = (2, 0, 1)\n minimum_calibre_version = (3, 6, 0)\n description = _('Downloads metadata and covers from Edelweiss - A catalog updated by book publishers')\n\n capabilities = frozenset(['identify', 'cover'])\n touched_fields = frozenset([\n 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',\n 'identifier:isbn', 'identifier:edelweiss', 'rating'])\n supports_gzip_transfer_encoding = True\n has_html_comments = True\n\n @property\n def user_agent(self):\n # Pass in an index to random_user_agent() to test with a particular\n # user agent\n return random_user_agent(allow_ie=False)\n\n def _get_book_url(self, sku):\n if sku:\n return 'https://www.edelweiss.plus/#sku={}&page=1'.format(sku)\n\n def get_book_url(self, identifiers): # {{{\n sku = identifiers.get('edelweiss', None)\n if sku:\n return 'edelweiss', sku, self._get_book_url(sku)\n\n # }}}\n\n def get_cached_cover_url(self, identifiers): # {{{\n sku = identifiers.get('edelweiss', None)\n if not sku:\n isbn = identifiers.get('isbn', None)\n if isbn is not None:\n sku = self.cached_isbn_to_identifier(isbn)\n return self.cached_identifier_to_cover_url(sku)\n # }}}\n\n def create_query(self, log, title=None, authors=None, identifiers={}):\n try:\n from urllib.parse import urlencode\n except ImportError:\n from urllib import urlencode\n import time\n BASE_URL = ('https://www.edelweiss.plus/GetTreelineControl.aspx?'\n 'controlName=/uc/listviews/controls/ListView_data.ascx&itemID=0&resultType=32&dashboardType=8&itemType=1&dataType=products&keywordSearch&')\n keywords = []\n isbn = check_isbn(identifiers.get('isbn', None))\n if isbn is not None:\n keywords.append(isbn)\n elif title:\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n keywords.extend(title_tokens)\n author_tokens = self.get_author_tokens(authors, only_first_author=True)\n if author_tokens:\n keywords.extend(author_tokens)\n if not keywords:\n return None\n params = {\n 'q': (' '.join(keywords)).encode('utf-8'),\n '_': type('')(int(time.time()))\n }\n return BASE_URL+urlencode(params)\n\n # }}}\n\n def identify(self, log, result_queue, abort, title=None, authors=None, # {{{\n identifiers={}, timeout=30):\n import json\n\n br = self.browser\n br.addheaders = [\n ('Referer', 'https://www.edelweiss.plus/'),\n ('X-Requested-With', 'XMLHttpRequest'),\n ('Cache-Control', 'no-cache'),\n ('Pragma', 'no-cache'),\n ]\n if 'edelweiss' in identifiers:\n items = [identifiers['edelweiss']]\n else:\n log.error('Currently Edelweiss returns random books for search queries')\n return\n query = self.create_query(log, title=title, authors=authors,\n identifiers=identifiers)\n if not query:\n log.error('Insufficient metadata to construct query')\n return\n log('Using query URL:', query)\n try:\n raw = br.open(query, timeout=timeout).read().decode('utf-8')\n except Exception as e:\n log.exception('Failed to make identify query: %r'%query)\n return as_unicode(e)\n items = re.search(r'window[.]items\\s*=\\s*(.+?);', raw)\n if items is None:\n log.error('Failed to get list of matching items')\n log.debug('Response text:')\n log.debug(raw)\n return\n items = json.loads(items.group(1))\n\n if (not items and identifiers and title and authors and\n not abort.is_set()):\n return self.identify(log, result_queue, abort, title=title,\n authors=authors, timeout=timeout)\n\n if not items:\n return\n\n workers = []\n items = items[:5]\n for i, item in enumerate(get_basic_data(self.browser, log, *items)):\n sku = item['sku']\n for isbn in item['isbns']:\n self.cache_isbn_to_identifier(isbn, sku)\n if item['cover']:\n self.cache_identifier_to_cover_url(sku, item['cover'])\n fmt = item['format'].lower()\n if 'audio' in fmt or 'mp3' in fmt:\n continue # Audio-book, ignore\n workers.append(Worker(item, i, result_queue, br.clone_browser(), timeout, log, self))\n\n if not workers:\n return\n\n for w in workers:\n w.start()\n # Don't send all requests at the same time\n time.sleep(0.1)\n\n while not abort.is_set():\n a_worker_is_alive = False\n for w in workers:\n w.join(0.2)\n if abort.is_set():\n break\n if w.is_alive():\n a_worker_is_alive = True\n if not a_worker_is_alive:\n break\n\n # }}}\n\n def download_cover(self, log, result_queue, abort, # {{{\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n cached_url = self.get_cached_cover_url(identifiers)\n if cached_url is None:\n log.info('No cached cover found, running identify')\n rq = Queue()\n self.identify(log, rq, abort, title=title, authors=authors,\n identifiers=identifiers)\n if abort.is_set():\n return\n results = []\n while True:\n try:\n results.append(rq.get_nowait())\n except Empty:\n break\n results.sort(key=self.identify_results_keygen(\n title=title, authors=authors, identifiers=identifiers))\n for mi in results:\n cached_url = self.get_cached_cover_url(mi.identifiers)\n if cached_url is not None:\n break\n if cached_url is None:\n log.info('No cover found')\n return\n\n if abort.is_set():\n return\n br = self.browser\n log('Downloading cover from:', cached_url)\n try:\n cdata = br.open_novisit(cached_url, timeout=timeout).read()\n result_queue.put((self, cdata))\n except:\n log.exception('Failed to download cover from:', cached_url)\n # }}}\n\n\nif __name__ == '__main__':\n from calibre.ebooks.metadata.sources.test import (\n test_identify_plugin, title_test, authors_test, comments_test, pubdate_test)\n tests = [\n ( # A title and author search\n {'title': 'The Husband\\'s Secret', 'authors':['Liane Moriarty']},\n [title_test('The Husband\\'s Secret', exact=True),\n authors_test(['Liane Moriarty'])]\n ),\n\n ( # An isbn present in edelweiss\n {'identifiers':{'isbn': '9780312621360'}, },\n [title_test('Flame: A Sky Chasers Novel', exact=True),\n authors_test(['Amy Kathleen Ryan'])]\n ),\n\n # Multiple authors and two part title and no general description\n ({'identifiers':{'edelweiss':'0321180607'}},\n [title_test(\n \"XQuery From the Experts: A Guide to the W3C XML Query Language\"\n , exact=True), authors_test([\n 'Howard Katz', 'Don Chamberlin', 'Denise Draper', 'Mary Fernandez',\n 'Michael Kay', 'Jonathan Robie', 'Michael Rys', 'Jerome Simeon',\n 'Jim Tivy', 'Philip Wadler']), pubdate_test(2003, 8, 22),\n comments_test('Jérôme Siméon'), lambda mi: bool(mi.comments and 'No title summary' not in mi.comments)\n ]),\n ]\n start, stop = 0, len(tests)\n\n tests = tests[start:stop]\n test_identify_plugin(Edelweiss.name, tests)\n",
"google": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>\nfrom __future__ import absolute_import, division, print_function, unicode_literals\nimport hashlib\nimport os\nimport re\nimport regex\nimport sys\nimport tempfile\nimport time\n\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\n\nfrom calibre import as_unicode, replace_entities, prepare_string_for_xml\nfrom calibre.ebooks.chardet import xml_to_unicode\nfrom calibre.ebooks.metadata import authors_to_string, check_isbn\nfrom calibre.ebooks.metadata.book.base import Metadata\nfrom calibre.ebooks.metadata.sources.base import Source\nfrom calibre.utils.cleantext import clean_ascii_chars\nfrom calibre.utils.localization import canonicalize_lang\n\nNAMESPACES = {\n 'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',\n 'atom': 'http://www.w3.org/2005/Atom',\n 'dc': 'http://purl.org/dc/terms',\n 'gd': 'http://schemas.google.com/g/2005'\n}\n\n\ndef pretty_google_books_comments(raw):\n raw = replace_entities(raw)\n # Paragraphs in the comments are removed but whatever software googl uses\n # to do this does not insert a space so we often find the pattern\n # word.Capital in the comments which can be used to find paragraph markers.\n parts = []\n for x in re.split(r'([a-z)\"”])(\\.)([A-Z(\"“])', raw):\n if x == '.':\n parts.append('.</p>\\n\\n<p>')\n else:\n parts.append(prepare_string_for_xml(x))\n raw = '<p>' + ''.join(parts) + '</p>'\n return raw\n\n\ndef get_details(browser, url, timeout): # {{{\n try:\n raw = browser.open_novisit(url, timeout=timeout).read()\n except Exception as e:\n gc = getattr(e, 'getcode', lambda: -1)\n if gc() != 403:\n raise\n # Google is throttling us, wait a little\n time.sleep(2)\n raw = browser.open_novisit(url, timeout=timeout).read()\n\n return raw\n\n\n# }}}\n\nxpath_cache = {}\n\n\ndef XPath(x):\n ans = xpath_cache.get(x)\n if ans is None:\n from lxml import etree\n ans = xpath_cache[x] = etree.XPath(x, namespaces=NAMESPACES)\n return ans\n\n\ndef to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{\n from lxml import etree\n\n # total_results = XPath('//openSearch:totalResults')\n # start_index = XPath('//openSearch:startIndex')\n # items_per_page = XPath('//openSearch:itemsPerPage')\n entry = XPath('//atom:entry')\n entry_id = XPath('descendant::atom:id')\n url = XPath('descendant::atom:link[@rel=\"self\"]/@href')\n creator = XPath('descendant::dc:creator')\n identifier = XPath('descendant::dc:identifier')\n title = XPath('descendant::dc:title')\n date = XPath('descendant::dc:date')\n publisher = XPath('descendant::dc:publisher')\n subject = XPath('descendant::dc:subject')\n description = XPath('descendant::dc:description')\n language = XPath('descendant::dc:language')\n\n # print(etree.tostring(entry_, pretty_print=True))\n\n def get_text(extra, x):\n try:\n ans = x(extra)\n if ans:\n ans = ans[0].text\n if ans and ans.strip():\n return ans.strip()\n except:\n log.exception('Programming error:')\n return None\n\n def get_extra_details():\n raw = get_details(browser, details_url, timeout)\n if running_a_test:\n with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:\n f.write(raw)\n print('Book details saved to:', f.name, file=sys.stderr)\n feed = etree.fromstring(\n xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n )\n return entry(feed)[0]\n\n if isinstance(entry_, str):\n google_id = entry_\n details_url = 'https://www.google.com/books/feeds/volumes/' + google_id\n extra = get_extra_details()\n title_ = ': '.join([x.text for x in title(extra)]).strip()\n authors = [x.text.strip() for x in creator(extra) if x.text]\n else:\n id_url = entry_id(entry_)[0].text\n google_id = id_url.split('/')[-1]\n details_url = url(entry_)[0]\n title_ = ': '.join([x.text for x in title(entry_)]).strip()\n authors = [x.text.strip() for x in creator(entry_) if x.text]\n if not id_url or not title:\n # Silently discard this entry\n return None\n extra = None\n\n if not authors:\n authors = [_('Unknown')]\n if not title:\n return None\n if extra is None:\n extra = get_extra_details()\n mi = Metadata(title_, authors)\n mi.identifiers = {'google': google_id}\n mi.comments = get_text(extra, description)\n lang = canonicalize_lang(get_text(extra, language))\n if lang:\n mi.language = lang\n mi.publisher = get_text(extra, publisher)\n\n # ISBN\n isbns = []\n for x in identifier(extra):\n t = type('')(x.text).strip()\n if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):\n if t[:5].upper() == 'ISBN:':\n t = check_isbn(t[5:])\n if t:\n isbns.append(t)\n if isbns:\n mi.isbn = sorted(isbns, key=len)[-1]\n mi.all_isbns = isbns\n\n # Tags\n try:\n btags = [x.text for x in subject(extra) if x.text]\n tags = []\n for t in btags:\n atags = [y.strip() for y in t.split('/')]\n for tag in atags:\n if tag not in tags:\n tags.append(tag)\n except:\n log.exception('Failed to parse tags:')\n tags = []\n if tags:\n mi.tags = [x.replace(',', ';') for x in tags]\n\n # pubdate\n pubdate = get_text(extra, date)\n if pubdate:\n from calibre.utils.date import parse_date, utcnow\n try:\n default = utcnow().replace(day=15)\n mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)\n except:\n log.error('Failed to parse pubdate %r' % pubdate)\n\n # Cover\n mi.has_google_cover = None\n for x in extra.xpath(\n '//*[@href and @rel=\"http://schemas.google.com/books/2008/thumbnail\"]'\n ):\n mi.has_google_cover = x.get('href')\n break\n\n return mi\n\n\n# }}}\n\n\nclass GoogleBooks(Source):\n\n name = 'Google'\n version = (1, 1, 0)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads metadata and covers from Google Books')\n\n capabilities = frozenset({'identify'})\n touched_fields = frozenset({\n 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',\n 'identifier:isbn', 'identifier:google', 'languages'\n })\n supports_gzip_transfer_encoding = True\n cached_cover_url_is_reliable = False\n\n GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'\n\n DUMMY_IMAGE_MD5 = frozenset(\n ('0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f')\n )\n\n def get_book_url(self, identifiers): # {{{\n goog = identifiers.get('google', None)\n if goog is not None:\n return ('google', goog, 'https://books.google.com/books?id=%s' % goog)\n # }}}\n\n def id_from_url(self, url): # {{{\n from polyglot.urllib import parse_qs, urlparse\n purl = urlparse(url)\n if purl.netloc == 'books.google.com':\n q = parse_qs(purl.query)\n gid = q.get('id')\n if gid:\n return 'google', gid[0]\n # }}}\n\n def create_query(self, title=None, authors=None, identifiers={}, capitalize_isbn=False): # {{{\n try:\n from urllib.parse import urlencode\n except ImportError:\n from urllib import urlencode\n BASE_URL = 'https://books.google.com/books/feeds/volumes?'\n isbn = check_isbn(identifiers.get('isbn', None))\n q = ''\n if isbn is not None:\n q += ('ISBN:' if capitalize_isbn else 'isbn:') + isbn\n elif title or authors:\n\n def build_term(prefix, parts):\n return ' '.join('in' + prefix + ':' + x for x in parts)\n\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n q += build_term('title', title_tokens)\n author_tokens = list(self.get_author_tokens(authors, only_first_author=True))\n if author_tokens:\n q += ('+' if q else '') + build_term('author', author_tokens)\n\n if not q:\n return None\n if not isinstance(q, bytes):\n q = q.encode('utf-8')\n return BASE_URL + urlencode({\n 'q': q,\n 'max-results': 20,\n 'start-index': 1,\n 'min-viewability': 'none',\n })\n\n # }}}\n\n def download_cover( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30,\n get_best_cover=False\n ):\n cached_url = self.get_cached_cover_url(identifiers)\n if cached_url is None:\n log.info('No cached cover found, running identify')\n rq = Queue()\n self.identify(\n log,\n rq,\n abort,\n title=title,\n authors=authors,\n identifiers=identifiers\n )\n if abort.is_set():\n return\n results = []\n while True:\n try:\n results.append(rq.get_nowait())\n except Empty:\n break\n results.sort(\n key=self.identify_results_keygen(\n title=title, authors=authors, identifiers=identifiers\n )\n )\n for mi in results:\n cached_url = self.get_cached_cover_url(mi.identifiers)\n if cached_url is not None:\n break\n if cached_url is None:\n log.info('No cover found')\n return\n\n br = self.browser\n for candidate in (0, 1):\n if abort.is_set():\n return\n url = cached_url + '&zoom={}'.format(candidate)\n log('Downloading cover from:', cached_url)\n try:\n cdata = br.open_novisit(url, timeout=timeout).read()\n if cdata:\n if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5:\n log.warning('Google returned a dummy image, ignoring')\n else:\n result_queue.put((self, cdata))\n break\n except Exception:\n log.exception('Failed to download cover from:', cached_url)\n\n # }}}\n\n def get_cached_cover_url(self, identifiers): # {{{\n url = None\n goog = identifiers.get('google', None)\n if goog is None:\n isbn = identifiers.get('isbn', None)\n if isbn is not None:\n goog = self.cached_isbn_to_identifier(isbn)\n if goog is not None:\n url = self.cached_identifier_to_cover_url(goog)\n\n return url\n\n # }}}\n\n def postprocess_downloaded_google_metadata(self, ans, relevance=0): # {{{\n if not isinstance(ans, Metadata):\n return ans\n ans.source_relevance = relevance\n goog = ans.identifiers['google']\n for isbn in getattr(ans, 'all_isbns', []):\n self.cache_isbn_to_identifier(isbn, goog)\n if getattr(ans, 'has_google_cover', False):\n self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog)\n if ans.comments:\n ans.comments = pretty_google_books_comments(ans.comments)\n self.clean_downloaded_metadata(ans)\n return ans\n # }}}\n\n def get_all_details( # {{{\n self,\n br,\n log,\n entries,\n abort,\n result_queue,\n timeout\n ):\n from lxml import etree\n for relevance, i in enumerate(entries):\n try:\n ans = self.postprocess_downloaded_google_metadata(to_metadata(br, log, i, timeout, self.running_a_test), relevance)\n if isinstance(ans, Metadata):\n result_queue.put(ans)\n except Exception:\n log.exception(\n 'Failed to get metadata for identify entry:', etree.tostring(i)\n )\n if abort.is_set():\n break\n\n # }}}\n\n def identify_via_web_search( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30\n ):\n isbn = check_isbn(identifiers.get('isbn', None))\n q = []\n strip_punc_pat = regex.compile(r'[\\p{C}|\\p{M}|\\p{P}|\\p{S}|\\p{Z}]+', regex.UNICODE)\n google_ids = []\n check_tokens = set()\n has_google_id = 'google' in identifiers\n\n def to_check_tokens(*tokens):\n for t in tokens:\n if len(t) < 3:\n continue\n t = t.lower()\n if t in ('and', 'not', 'the'):\n continue\n yield strip_punc_pat.sub('', t)\n\n if has_google_id:\n google_ids.append(identifiers['google'])\n elif isbn is not None:\n q.append(isbn)\n elif title or authors:\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n q += title_tokens\n check_tokens |= set(to_check_tokens(*title_tokens))\n author_tokens = list(self.get_author_tokens(authors, only_first_author=True))\n if author_tokens:\n q += author_tokens\n check_tokens |= set(to_check_tokens(*author_tokens))\n if not q and not google_ids:\n return None\n from calibre.ebooks.metadata.sources.update import search_engines_module\n se = search_engines_module()\n br = se.google_specialize_browser(se.browser())\n if not has_google_id:\n url = se.google_format_query(q, tbm='bks')\n log('Making query:', url)\n r = []\n root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)\n pat = re.compile(r'id=([^&]+)')\n for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):\n m = pat.search(q.url)\n if m is None or not q.url.startswith('https://books.google'):\n continue\n google_ids.append(m.group(1))\n\n if not google_ids and isbn and (title or authors):\n return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)\n found = False\n seen = set()\n for relevance, gid in enumerate(google_ids):\n if gid in seen:\n continue\n seen.add(gid)\n try:\n ans = to_metadata(br, log, gid, timeout, self.running_a_test)\n if isinstance(ans, Metadata):\n if isbn:\n if isbn not in ans.all_isbns:\n log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn,\n 'not in', ' '.join(ans.all_isbns))\n continue\n elif check_tokens:\n candidate = set(to_check_tokens(*self.get_title_tokens(ans.title)))\n candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors)))\n if candidate.intersection(check_tokens) != check_tokens:\n log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')\n continue\n ans = self.postprocess_downloaded_google_metadata(ans, relevance)\n result_queue.put(ans)\n found = True\n except:\n log.exception('Failed to get metadata for google books id:', gid)\n if abort.is_set():\n break\n if not found and isbn and (title or authors):\n return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)\n # }}}\n\n def identify( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30\n ):\n from lxml import etree\n entry = XPath('//atom:entry')\n identifiers = identifiers.copy()\n br = self.browser\n if 'google' in identifiers:\n try:\n ans = to_metadata(br, log, identifiers['google'], timeout, self.running_a_test)\n if isinstance(ans, Metadata):\n self.postprocess_downloaded_google_metadata(ans)\n result_queue.put(ans)\n return\n except Exception:\n log.exception('Failed to get metadata for Google identifier:', identifiers['google'])\n del identifiers['google']\n\n query = self.create_query(\n title=title, authors=authors, identifiers=identifiers\n )\n if not query:\n log.error('Insufficient metadata to construct query')\n return\n\n def make_query(query):\n log('Making query:', query)\n try:\n raw = br.open_novisit(query, timeout=timeout).read()\n except Exception as e:\n log.exception('Failed to make identify query: %r' % query)\n return False, as_unicode(e)\n\n try:\n feed = etree.fromstring(\n xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n )\n return True, entry(feed)\n except Exception as e:\n log.exception('Failed to parse identify results')\n return False, as_unicode(e)\n ok, entries = make_query(query)\n if not ok:\n return entries\n if not entries and not abort.is_set():\n log('No results found, doing a web search instead')\n return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)\n\n # There is no point running these queries in threads as google\n # throttles requests returning 403 Forbidden errors\n self.get_all_details(br, log, entries, abort, result_queue, timeout)\n\n # }}}\n\n\nif __name__ == '__main__': # tests {{{\n # To run these test use:\n # calibre-debug src/calibre/ebooks/metadata/sources/google.py\n from calibre.ebooks.metadata.sources.test import (\n authors_test, test_identify_plugin, title_test\n )\n tests = [\n ({\n 'identifiers': {'google': 's7NIrgEACAAJ'},\n }, [title_test('Ride Every Stride', exact=False)]),\n\n ({\n 'identifiers': {'isbn': '0743273567'},\n 'title': 'Great Gatsby',\n 'authors': ['Fitzgerald']\n }, [\n title_test('The great gatsby', exact=True),\n authors_test(['F. Scott Fitzgerald'])\n ]),\n\n ({\n 'title': 'Flatland',\n 'authors': ['Abbott']\n }, [title_test('Flatland', exact=False)]),\n\n ({\n 'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',\n 'authors': ['David Handler'],\n }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')\n ]),\n\n ({\n # requires using web search to find the book\n 'title': 'Dragon Done It',\n 'authors': ['Eric Flint'],\n }, [\n title_test('The dragon done it', exact=True),\n authors_test(['Eric Flint', 'Mike Resnick'])\n ]),\n\n ]\n test_identify_plugin(GoogleBooks.name, tests[:])\n\n# }}}\n",
"google_images": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom collections import OrderedDict\n\nfrom calibre import random_user_agent\nfrom calibre.ebooks.metadata.sources.base import Source, Option\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef imgurl_from_id(raw, tbnid):\n from json import JSONDecoder\n q = '\"{}\",['.format(tbnid)\n start_pos = raw.index(q)\n if start_pos < 100:\n return\n jd = JSONDecoder()\n data = jd.raw_decode('[' + raw[start_pos:])[0]\n # from pprint import pprint\n # pprint(data)\n url_num = 0\n for x in data:\n if isinstance(x, list) and len(x) == 3:\n q = x[0]\n if hasattr(q, 'lower') and q.lower().startswith('http'):\n url_num += 1\n if url_num > 1:\n return q\n\n\nclass GoogleImages(Source):\n\n name = 'Google Images'\n version = (1, 0, 3)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.')\n capabilities = frozenset(['cover'])\n can_get_multiple_covers = True\n supports_gzip_transfer_encoding = True\n options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),\n _('The maximum number of covers to process from the Google search result')),\n Option('size', 'choices', 'svga', _('Cover size'),\n _('Search for covers larger than the specified size'),\n choices=OrderedDict((\n ('any', _('Any size'),),\n ('l', _('Large'),),\n ('qsvga', _('Larger than %s')%'400x300',),\n ('vga', _('Larger than %s')%'640x480',),\n ('svga', _('Larger than %s')%'600x800',),\n ('xga', _('Larger than %s')%'1024x768',),\n ('2mp', _('Larger than %s')%'2 MP',),\n ('4mp', _('Larger than %s')%'4 MP',),\n ))),\n )\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if not title:\n return\n timeout = max(60, timeout) # Needs at least a minute\n title = ' '.join(self.get_title_tokens(title))\n author = ' '.join(self.get_author_tokens(authors))\n urls = self.get_image_urls(title, author, log, abort, timeout)\n self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)\n\n @property\n def user_agent(self):\n return random_user_agent(allow_ie=False)\n\n def get_image_urls(self, title, author, log, abort, timeout):\n from calibre.utils.cleantext import clean_ascii_chars\n try:\n from urllib.parse import urlencode\n except ImportError:\n from urllib import urlencode\n from collections import OrderedDict\n ans = OrderedDict()\n br = self.browser\n q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')})\n if isinstance(q, bytes):\n q = q.decode('utf-8')\n sz = self.prefs['size']\n if sz == 'any':\n sz = ''\n elif sz == 'l':\n sz = 'isz:l,'\n else:\n sz = 'isz:lt,islt:%s,' % sz\n # See https://www.google.com/advanced_image_search to understand this\n # URL scheme\n url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)\n log('Search URL: ' + url)\n br.set_simple_cookie('CONSENT', 'YES+', '.google.com', path='/')\n raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))\n root = parse_html(raw)\n results = root.xpath('//div/@data-tbnid') # could also use data-id\n # from calibre.utils.ipython import ipython\n # ipython({'root': root, 'raw': raw, 'url': url, 'results': results})\n for tbnid in results:\n try:\n imgurl = imgurl_from_id(raw, tbnid)\n except Exception:\n continue\n if imgurl:\n ans[imgurl] = True\n return list(ans)\n\n\ndef test():\n try:\n from queue import Queue\n except ImportError:\n from Queue import Queue\n from threading import Event\n from calibre.utils.logging import default_log\n p = GoogleImages(None)\n p.log = default_log\n rq = Queue()\n p.download_cover(default_log, rq, Event(), title='The Heroes',\n authors=('Joe Abercrombie',))\n print('Downloaded', rq.qsize(), 'covers')\n\n\nif __name__ == '__main__':\n test()\n",
"hashes": {
"amazon": "e2dc988ace313e5689de47912c4c1446e2311868",
"big_book_search": "9b468ff223b7ddbf6bbd26e58194c0fa8fbbfb69",
"edelweiss": "efd8f281a56435895b7916f85598d469bb53bb02",
"google": "bfe34ba5e1be36511f2228a350bd91c676724aac",
"google_images": "3323061049afe11a42422f325783de220991149d",
"openlibrary": "8707d3b3161de476b46ed967dea1116707dcfb0a",
"search_engines": "52a24c5004033fc9f393a260a09e6f8afb80b0ba"
},
"openlibrary": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom calibre.ebooks.metadata.sources.base import Source\n\n\nclass OpenLibrary(Source):\n\n name = 'Open Library'\n version = (1, 0, 0)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads covers from The Open Library')\n\n capabilities = frozenset(['cover'])\n\n OPENLIBRARY = 'https://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if 'isbn' not in identifiers:\n return\n isbn = identifiers['isbn']\n br = self.browser\n try:\n ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()\n result_queue.put((self, ans))\n except Exception as e:\n if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:\n log.error('No cover for ISBN: %r found'%isbn)\n else:\n log.exception('Failed to download cover for ISBN:', isbn)\n",
"search_engines": "#!/usr/bin/env python\n# vim:fileencoding=utf-8\n# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\nimport json\nimport os\nimport re\nimport time\nfrom collections import namedtuple\nfrom contextlib import contextmanager\nfrom threading import Lock\n\ntry:\n from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote\nexcept ImportError:\n from urlparse import parse_qs\n from urllib import quote_plus, urlencode, unquote, quote\n\nfrom lxml import etree\n\nfrom calibre import browser as _browser, prints, random_user_agent\nfrom calibre.constants import cache_dir\nfrom calibre.ebooks.chardet import xml_to_unicode\nfrom calibre.utils.lock import ExclusiveFile\nfrom calibre.utils.random_ua import accept_header_for_ua\n\ncurrent_version = (1, 2, 0)\nminimum_calibre_version = (2, 80, 0)\nwebcache = {}\nwebcache_lock = Lock()\n\n\nResult = namedtuple('Result', 'url title cached_url')\n\n\n@contextmanager\ndef rate_limit(name='test', time_between_visits=2, max_wait_seconds=5 * 60, sleep_time=0.2):\n lock_file = os.path.join(cache_dir(), 'search-engine.' + name + '.lock')\n with ExclusiveFile(lock_file, timeout=max_wait_seconds, sleep_time=sleep_time) as f:\n try:\n lv = float(f.read().decode('utf-8').strip())\n except Exception:\n lv = 0\n # we cannot use monotonic() as this is cross process and historical\n # data as well\n delta = time.time() - lv\n if delta < time_between_visits:\n time.sleep(time_between_visits - delta)\n try:\n yield\n finally:\n f.seek(0)\n f.truncate()\n f.write(repr(time.time()).encode('utf-8'))\n\n\ndef tostring(elem):\n return etree.tostring(elem, encoding='unicode', method='text', with_tail=False)\n\n\ndef browser():\n ua = random_user_agent(allow_ie=False)\n # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'\n br = _browser(user_agent=ua)\n br.set_handle_gzip(True)\n br.addheaders += [\n ('Accept', accept_header_for_ua(ua)),\n ('Upgrade-insecure-requests', '1'),\n ]\n return br\n\n\ndef encode_query(**query):\n q = {k.encode('utf-8'): v.encode('utf-8') for k, v in query.items()}\n return urlencode(q).decode('utf-8')\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None, simple_scraper=None):\n with rate_limit(key):\n if simple_scraper is None:\n raw = br.open_novisit(url, timeout=timeout).read()\n raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]\n else:\n raw = simple_scraper(url, timeout=timeout)\n if dump_raw is not None:\n with open(dump_raw, 'w') as f:\n f.write(raw)\n if save_raw is not None:\n save_raw(raw)\n return parser(raw)\n\n\ndef quote_term(x):\n ans = quote_plus(x.encode('utf-8'))\n if isinstance(ans, bytes):\n ans = ans.decode('utf-8')\n return ans\n\n\n# DDG + Wayback machine {{{\n\n\ndef ddg_url_processor(url):\n return url\n\n\ndef ddg_term(t):\n t = t.replace('\"', '')\n if t.lower() in {'map', 'news'}:\n t = '\"' + t + '\"'\n if t in {'OR', 'AND', 'NOT'}:\n t = t.lower()\n return t\n\n\ndef ddg_href(url):\n if url.startswith('/'):\n q = url.partition('?')[2]\n url = parse_qs(q.encode('utf-8'))['uddg'][0].decode('utf-8')\n return url\n\n\ndef wayback_machine_cached_url(url, br=None, log=prints, timeout=60):\n q = quote_term(url)\n br = br or browser()\n data = query(br, 'https://archive.org/wayback/available?url=' +\n q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)\n try:\n closest = data['archived_snapshots']['closest']\n if closest['available']:\n return closest['url'].replace('http:', 'https:')\n except Exception:\n pass\n from pprint import pformat\n log('Response from wayback machine:', pformat(data))\n\n\ndef wayback_url_processor(url):\n if url.startswith('/'):\n # Use original URL instead of absolutizing to wayback URL as wayback is\n # slow\n m = re.search('https?:', url)\n if m is None:\n url = 'https://web.archive.org' + url\n else:\n url = url[m.start():]\n return url\n\n\ndef ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):\n # https://duck.co/help/results/syntax\n terms = [quote_term(ddg_term(t)) for t in terms]\n if site is not None:\n terms.append(quote_term(('site:' + site)))\n q = '+'.join(terms)\n url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(\n q=q, kp=1 if safe_search else -1)\n log('Making ddg query: ' + url)\n br = br or browser()\n root = query(br, url, 'ddg', dump_raw, timeout=timeout)\n ans = []\n for a in root.xpath('//*[@class=\"results\"]//*[@class=\"result__title\"]/a[@href and @class=\"result__a\"]'):\n try:\n ans.append(Result(ddg_href(a.get('href')), tostring(a), None))\n except KeyError:\n log('Failed to find ddg href in:', a.get('href'))\n return ans, url\n\n\ndef ddg_develop():\n br = browser()\n for result in ddg_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:\n if '/dp/' in result.url:\n print(result.title)\n print(' ', result.url)\n print(' ', get_cached_url(result.url, br))\n print()\n# }}}\n\n# Bing {{{\n\n\ndef bing_term(t):\n t = t.replace('\"', '')\n if t in {'OR', 'AND', 'NOT'}:\n t = t.lower()\n return t\n\n\ndef bing_url_processor(url):\n return url\n\n\ndef bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60, show_user_agent=False):\n # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm\n terms = [quote_term(bing_term(t)) for t in terms]\n if site is not None:\n terms.append(quote_term(('site:' + site)))\n q = '+'.join(terms)\n url = 'https://www.bing.com/search?q={q}'.format(q=q)\n log('Making bing query: ' + url)\n br = br or browser()\n br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']\n ua = ''\n from calibre.utils.random_ua import random_common_chrome_user_agent\n while not ua or 'Edg/' in ua:\n ua = random_common_chrome_user_agent()\n if show_user_agent:\n print('User-agent:', ua)\n br.addheaders.append(('User-agent', ua))\n\n root = query(br, url, 'bing', dump_raw, timeout=timeout)\n ans = []\n for li in root.xpath('//*[@id=\"b_results\"]/li[@class=\"b_algo\"]'):\n a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class=\"b_algoheader\"]/a[@href]')\n a = a[0]\n title = tostring(a)\n try:\n div = li.xpath('descendant::div[@class=\"b_attribution\" and @u]')[0]\n except IndexError:\n log('Ignoring {!r} as it has no cached page'.format(title))\n continue\n d, w = div.get('u').split('|')[-2:]\n cached_url = 'https://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(\n q=q, d=d, w=w)\n ans.append(Result(a.get('href'), title, cached_url))\n if not ans:\n title = ' '.join(root.xpath('//title/text()'))\n log('Failed to find any results on results page, with title:', title)\n return ans, url\n\n\ndef bing_develop():\n for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', show_user_agent=True)[0]:\n if '/dp/' in result.url:\n print(result.title)\n print(' ', result.url)\n print(' ', result.cached_url)\n print()\n# }}}\n\n# Google {{{\n\n\ndef google_term(t):\n t = t.replace('\"', '')\n if t in {'OR', 'AND', 'NOT'}:\n t = t.lower()\n return t\n\n\ndef google_url_processor(url):\n return url\n\n\ndef google_get_cached_url(url, br=None, log=prints, timeout=60):\n ourl = url\n if not isinstance(url, bytes):\n url = url.encode('utf-8')\n cu = quote(url, safe='')\n if isinstance(cu, bytes):\n cu = cu.decode('utf-8')\n cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu\n br = google_specialize_browser(br or browser())\n try:\n raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)\n except Exception as err:\n log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))\n else:\n with webcache_lock:\n webcache[cached_url] = raw\n return cached_url\n\n\ndef google_extract_cache_urls(raw):\n if isinstance(raw, bytes):\n raw = raw.decode('utf-8', 'replace')\n pat = re.compile(r'\\\\x22(https://webcache\\.googleusercontent\\.com/.+?)\\\\x22')\n upat = re.compile(r'\\\\\\\\u([0-9a-fA-F]{4})')\n cache_pat = re.compile('cache:([^:]+):(.+)')\n\n def urepl(m):\n return chr(int(m.group(1), 16))\n\n seen = set()\n ans = {}\n for m in pat.finditer(raw):\n cache_url = upat.sub(urepl, m.group(1))\n m = cache_pat.search(cache_url)\n cache_id, src_url = m.group(1), m.group(2)\n if cache_id in seen:\n continue\n seen.add(cache_id)\n src_url = src_url.split('+')[0]\n src_url = unquote(src_url)\n ans[src_url] = cache_url\n return ans\n\n\ndef google_parse_results(root, raw, log=prints, ignore_uncached=True):\n cache_url_map = google_extract_cache_urls(raw)\n # print('\\n'.join(cache_url_map))\n ans = []\n for div in root.xpath('//*[@id=\"search\"]//*[@id=\"rso\"]//div[descendant::h3]'):\n try:\n a = div.xpath('descendant::a[@href]')[0]\n except IndexError:\n log('Ignoring div with no main result link')\n continue\n title = tostring(a)\n src_url = a.get('href')\n if src_url in cache_url_map:\n cached_url = cache_url_map[src_url]\n else:\n try:\n c = div.xpath('descendant::*[@role=\"menuitem\"]//a[@class=\"fl\"]')[0]\n except IndexError:\n if ignore_uncached:\n log('Ignoring {!r} as it has no cached page'.format(title))\n continue\n c = {'href': ''}\n cached_url = c.get('href')\n ans.append(Result(a.get('href'), title, cached_url))\n if not ans:\n title = ' '.join(root.xpath('//title/text()'))\n log('Failed to find any results on results page, with title:', title)\n return ans\n\n\ndef google_specialize_browser(br):\n with webcache_lock:\n if not hasattr(br, 'google_consent_cookie_added'):\n br.set_simple_cookie('CONSENT', 'YES+', '.google.com', path='/')\n br.google_consent_cookie_added = True\n return br\n\n\ndef google_format_query(terms, site=None, tbm=None):\n terms = [quote_term(google_term(t)) for t in terms]\n if site is not None:\n terms.append(quote_term(('site:' + site)))\n q = '+'.join(terms)\n url = 'https://www.google.com/search?q={q}'.format(q=q)\n if tbm:\n url += '&tbm=' + tbm\n return url\n\n\ndef google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):\n url = google_format_query(terms, site)\n log('Making google query: ' + url)\n br = google_specialize_browser(br or browser())\n r = []\n root = query(br, url, 'google', dump_raw, timeout=timeout, save_raw=r.append)\n return google_parse_results(root, r[0], log=log), url\n\n\ndef google_develop(search_terms='1423146786', raw_from=''):\n if raw_from:\n with open(raw_from, 'rb') as f:\n raw = f.read()\n results = google_parse_results(parse_html(raw), raw)\n else:\n br = browser()\n results = google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]\n for result in results:\n if '/dp/' in result.url:\n print(result.title)\n print(' ', result.url)\n print(' ', result.cached_url)\n print()\n# }}}\n\n\ndef get_cached_url(url, br=None, log=prints, timeout=60):\n return google_get_cached_url(url, br, log, timeout) or wayback_machine_cached_url(url, br, log, timeout)\n\n\ndef get_data_for_cached_url(url):\n with webcache_lock:\n return webcache.get(url)\n\n\ndef resolve_url(url):\n prefix, rest = url.partition(':')[::2]\n if prefix == 'bing':\n return bing_url_processor(rest)\n if prefix == 'wayback':\n return wayback_url_processor(rest)\n return url\n\n\n# if __name__ == '__main__':\n# import sys\n# func = sys.argv[-1]\n# globals()[func]()\n"
}