18 lines
139 KiB
JSON
18 lines
139 KiB
JSON
{
|
||
"amazon": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport re\nimport socket\nimport string\nimport time\nfrom functools import partial\n\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\n\nfrom threading import Thread\n\ntry:\n from urllib.parse import urlparse\nexcept ImportError:\n from urlparse import urlparse\n\nfrom mechanize import HTTPError\n\nfrom calibre import as_unicode, browser, random_user_agent, xml_replace_entities\nfrom calibre.ebooks.metadata import check_isbn\nfrom calibre.ebooks.metadata.book.base import Metadata\nfrom calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase\nfrom calibre.ebooks.oeb.base import urlquote\nfrom calibre.utils.icu import lower as icu_lower\nfrom calibre.utils.localization import canonicalize_lang\nfrom calibre.utils.random_ua import accept_header_for_ua\n\n\ndef sort_matches_preferring_kindle_editions(matches):\n upos_map = {url:i for i, url in enumerate(matches)}\n\n def skey(url):\n opos = upos_map[url]\n parts = url.split('/')\n try:\n idx = parts.index('dp')\n except Exception:\n idx = -1\n if idx < 0 or idx + 1 >= len(parts) or not parts[idx+1].startswith('B'):\n return 1, opos\n return 0, opos\n matches.sort(key=skey)\n return matches\n\n\ndef iri_quote_plus(url):\n ans = urlquote(url)\n if isinstance(ans, bytes):\n ans = ans.decode('utf-8')\n return ans.replace('%20', '+')\n\n\ndef user_agent_is_ok(ua):\n return 'Mobile/' not in ua and 'Mobile ' not in ua\n\n\nclass CaptchaError(Exception):\n pass\n\n\nclass SearchFailed(ValueError):\n pass\n\n\nclass UrlNotFound(ValueError):\n\n def __init__(self, url):\n ValueError.__init__(self, 'The URL {} was not found (HTTP 404)'.format(url))\n\n\nclass UrlTimedOut(ValueError):\n\n def __init__(self, url):\n ValueError.__init__(self, 'Timed out fetching {} try again later'.format(url))\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef parse_details_page(url, log, timeout, browser, domain):\n from lxml.html import tostring\n\n from calibre.ebooks.chardet import xml_to_unicode\n from calibre.utils.cleantext import clean_ascii_chars\n try:\n from calibre.ebooks.metadata.sources.update import search_engines_module\n get_data_for_cached_url = search_engines_module().get_data_for_cached_url\n except Exception:\n def get_data_for_cached_url(*a):\n return None\n raw = get_data_for_cached_url(url)\n if raw:\n log('Using cached details for url:', url)\n else:\n log('Downloading details from:', url)\n try:\n raw = browser.open_novisit(url, timeout=timeout).read().strip()\n except Exception as e:\n if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:\n log.error('URL not found: %r' % url)\n raise UrlNotFound(url)\n attr = getattr(e, 'args', [None])\n attr = attr if attr else [None]\n if isinstance(attr[0], socket.timeout):\n msg = 'Details page timed out. Try again later.'\n log.error(msg)\n raise UrlTimedOut(url)\n else:\n msg = 'Failed to make details query: %r' % url\n log.exception(msg)\n raise ValueError('Could not make details query for {}'.format(url))\n\n oraw = raw\n if 'amazon.com.br' in url:\n # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag\n raw = raw.decode('utf-8')\n raw = xml_to_unicode(raw, strip_encoding_pats=True,\n resolve_entities=True)[0]\n if '<title>404 - ' in raw:\n raise ValueError('Got a 404 page for: %r' % url)\n if '>Could not find the requested document in the cache.<' in raw:\n raise ValueError('No cached entry for %s found' % url)\n\n try:\n root = parse_html(clean_ascii_chars(raw))\n except Exception:\n msg = 'Failed to parse amazon details page: %r' % url\n log.exception(msg)\n raise ValueError(msg)\n if domain == 'jp':\n for a in root.xpath('//a[@href]'):\n if ('black-curtain-redirect.html' in a.get('href')) or ('/black-curtain/save-eligibility/black-curtain' in a.get('href')):\n url = a.get('href')\n if url:\n if url.startswith('/'):\n url = 'https://amazon.co.jp' + a.get('href')\n log('Black curtain redirect found, following')\n return parse_details_page(url, log, timeout, browser, domain)\n\n errmsg = root.xpath('//*[@id=\"errorMessage\"]')\n if errmsg:\n msg = 'Failed to parse amazon details page: %r' % url\n msg += tostring(errmsg, method='text', encoding='unicode').strip()\n log.error(msg)\n raise ValueError(msg)\n\n from css_selectors import Select\n selector = Select(root)\n return oraw, root, selector\n\n\ndef parse_asin(root, log, url):\n try:\n link = root.xpath('//link[@rel=\"canonical\" and @href]')\n for l in link:\n return l.get('href').rpartition('/')[-1]\n except Exception:\n log.exception('Error parsing ASIN for url: %r' % url)\n\n\nclass Worker(Thread): # Get details {{{\n\n '''\n Get book details from amazons book page in a separate thread\n '''\n\n def __init__(self, url, result_queue, browser, log, relevance, domain,\n plugin, timeout=20, testing=False, preparsed_root=None,\n cover_url_processor=None, filter_result=None):\n Thread.__init__(self)\n self.cover_url_processor = cover_url_processor\n self.preparsed_root = preparsed_root\n self.daemon = True\n self.testing = testing\n self.url, self.result_queue = url, result_queue\n self.log, self.timeout = log, timeout\n self.filter_result = filter_result or (lambda x, log: True)\n self.relevance, self.plugin = relevance, plugin\n self.browser = browser\n self.cover_url = self.amazon_id = self.isbn = None\n self.domain = domain\n from lxml.html import tostring\n self.tostring = tostring\n\n months = { # {{{\n 'de': {\n 1: ['jän', 'januar'],\n 2: ['februar'],\n 3: ['märz'],\n 5: ['mai'],\n 6: ['juni'],\n 7: ['juli'],\n 10: ['okt', 'oktober'],\n 12: ['dez', 'dezember']\n },\n 'it': {\n 1: ['gennaio', 'enn'],\n 2: ['febbraio', 'febbr'],\n 3: ['marzo'],\n 4: ['aprile'],\n 5: ['maggio', 'magg'],\n 6: ['giugno'],\n 7: ['luglio'],\n 8: ['agosto', 'ag'],\n 9: ['settembre', 'sett'],\n 10: ['ottobre', 'ott'],\n 11: ['novembre'],\n 12: ['dicembre', 'dic'],\n },\n 'fr': {\n 1: ['janv'],\n 2: ['févr'],\n 3: ['mars'],\n 4: ['avril'],\n 5: ['mai'],\n 6: ['juin'],\n 7: ['juil'],\n 8: ['août'],\n 9: ['sept'],\n 10: ['oct', 'octobre'],\n 11: ['nov', 'novembre'],\n 12: ['déc', 'décembre'],\n },\n 'br': {\n 1: ['janeiro'],\n 2: ['fevereiro'],\n 3: ['março'],\n 4: ['abril'],\n 5: ['maio'],\n 6: ['junho'],\n 7: ['julho'],\n 8: ['agosto'],\n 9: ['setembro'],\n 10: ['outubro'],\n 11: ['novembro'],\n 12: ['dezembro'],\n },\n 'es': {\n 1: ['enero'],\n 2: ['febrero'],\n 3: ['marzo'],\n 4: ['abril'],\n 5: ['mayo'],\n 6: ['junio'],\n 7: ['julio'],\n 8: ['agosto'],\n 9: ['septiembre', 'setiembre'],\n 10: ['octubre'],\n 11: ['noviembre'],\n 12: ['diciembre'],\n },\n 'se': {\n 1: ['januari'],\n 2: ['februari'],\n 3: ['mars'],\n 4: ['april'],\n 5: ['maj'],\n 6: ['juni'],\n 7: ['juli'],\n 8: ['augusti'],\n 9: ['september'],\n 10: ['oktober'],\n 11: ['november'],\n 12: ['december'],\n },\n 'jp': {\n 1: ['1月'],\n 2: ['2月'],\n 3: ['3月'],\n 4: ['4月'],\n 5: ['5月'],\n 6: ['6月'],\n 7: ['7月'],\n 8: ['8月'],\n 9: ['9月'],\n 10: ['10月'],\n 11: ['11月'],\n 12: ['12月'],\n },\n 'nl': {\n 1: ['januari'], 2: ['februari'], 3: ['maart'], 5: ['mei'], 6: ['juni'], 7: ['juli'], 8: ['augustus'], 10: ['oktober'],\n }\n\n } # }}}\n\n self.english_months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',\n 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']\n self.months = months.get(self.domain, {})\n\n self.pd_xpath = '''\n //h2[text()=\"Product Details\" or \\\n text()=\"Produktinformation\" or \\\n text()=\"Dettagli prodotto\" or \\\n text()=\"Product details\" or \\\n text()=\"Détails sur le produit\" or \\\n text()=\"Detalles del producto\" or \\\n text()=\"Detalhes do produto\" or \\\n text()=\"Productgegevens\" or \\\n text()=\"基本信息\" or \\\n starts-with(text(), \"登録情報\")]/../div[@class=\"content\"]\n '''\n # Editor: is for Spanish\n self.publisher_xpath = '''\n descendant::*[starts-with(text(), \"Publisher:\") or \\\n starts-with(text(), \"Verlag:\") or \\\n starts-with(text(), \"Editore:\") or \\\n starts-with(text(), \"Editeur\") or \\\n starts-with(text(), \"Editor:\") or \\\n starts-with(text(), \"Editora:\") or \\\n starts-with(text(), \"Uitgever:\") or \\\n starts-with(text(), \"Utgivare:\") or \\\n starts-with(text(), \"出版社:\")]\n '''\n self.pubdate_xpath = '''\n descendant::*[starts-with(text(), \"Publication Date:\") or \\\n starts-with(text(), \"Audible.com Release Date:\")]\n '''\n self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Utgivare', 'Herausgeber',\n 'Editore', 'Editeur', 'Éditeur', 'Editor', 'Editora', '出版社'}\n\n self.language_xpath = '''\n descendant::*[\n starts-with(text(), \"Language:\") \\\n or text() = \"Language\" \\\n or text() = \"Sprache:\" \\\n or text() = \"Lingua:\" \\\n or text() = \"Idioma:\" \\\n or starts-with(text(), \"Langue\") \\\n or starts-with(text(), \"言語\") \\\n or starts-with(text(), \"Språk\") \\\n or starts-with(text(), \"语种\")\n ]\n '''\n self.language_names = {'Language', 'Sprache', 'Språk',\n 'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}\n\n self.tags_xpath = '''\n descendant::h2[\n text() = \"Look for Similar Items by Category\" or\n text() = \"Ähnliche Artikel finden\" or\n text() = \"Buscar productos similares por categoría\" or\n text() = \"Ricerca articoli simili per categoria\" or\n text() = \"Rechercher des articles similaires par rubrique\" or\n text() = \"Procure por items similares por categoria\" or\n text() = \"関連商品を探す\"\n ]/../descendant::ul/li\n '''\n\n self.ratings_pat = re.compile(\n r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) '\n r'([\\d\\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}'\n )\n self.ratings_pat_cn = re.compile('([0-9.]+) 颗星,最多 5 颗星')\n self.ratings_pat_jp = re.compile(r'\\d+つ星のうち([\\d\\.]+)')\n\n lm = {\n 'eng': ('English', 'Englisch', 'Engels', 'Engelska'),\n 'fra': ('French', 'Français'),\n 'ita': ('Italian', 'Italiano'),\n 'deu': ('German', 'Deutsch'),\n 'spa': ('Spanish', 'Espa\\xf1ol', 'Espaniol'),\n 'jpn': ('Japanese', '日本語'),\n 'por': ('Portuguese', 'Português'),\n 'nld': ('Dutch', 'Nederlands',),\n 'chs': ('Chinese', '中文', '简体中文'),\n 'swe': ('Swedish', 'Svenska'),\n }\n self.lang_map = {}\n for code, names in lm.items():\n for name in names:\n self.lang_map[name] = code\n\n self.series_pat = re.compile(\n r'''\n \\|\\s* # Prefix\n (Series)\\s*:\\s* # Series declaration\n (?P<series>.+?)\\s+ # The series name\n \\((Book)\\s* # Book declaration\n (?P<index>[0-9.]+) # Series index\n \\s*\\)\n ''', re.X)\n\n def delocalize_datestr(self, raw):\n if self.domain == 'cn':\n return raw.replace('年', '-').replace('月', '-').replace('日', '')\n if not self.months:\n return raw\n ans = raw.lower()\n for i, vals in self.months.items():\n for x in vals:\n ans = ans.replace(x, self.english_months[i])\n ans = ans.replace(' de ', ' ')\n return ans\n\n def run(self):\n try:\n self.get_details()\n except:\n self.log.exception('get_details failed for url: %r' % self.url)\n\n def get_details(self):\n if self.preparsed_root is None:\n raw, root, selector = parse_details_page(\n self.url, self.log, self.timeout, self.browser, self.domain)\n else:\n raw, root, selector = self.preparsed_root\n\n from css_selectors import Select\n self.selector = Select(root)\n self.parse_details(raw, root)\n\n def parse_details(self, raw, root):\n asin = parse_asin(root, self.log, self.url)\n if not asin and root.xpath('//form[@action=\"/errors/validateCaptcha\"]'):\n raise CaptchaError(\n 'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')\n if self.testing:\n import tempfile\n import uuid\n with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',\n suffix='.html', delete=False) as f:\n f.write(raw)\n print('Downloaded HTML for', asin, 'saved in', f.name)\n\n try:\n title = self.parse_title(root)\n except:\n self.log.exception('Error parsing title for url: %r' % self.url)\n title = None\n\n try:\n authors = self.parse_authors(root)\n except:\n self.log.exception('Error parsing authors for url: %r' % self.url)\n authors = []\n\n if not title or not authors or not asin:\n self.log.error(\n 'Could not find title/authors/asin for %r' % self.url)\n self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,\n authors))\n return\n\n mi = Metadata(title, authors)\n idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain\n mi.set_identifier(idtype, asin)\n self.amazon_id = asin\n\n try:\n mi.rating = self.parse_rating(root)\n except:\n self.log.exception('Error parsing ratings for url: %r' % self.url)\n\n try:\n mi.comments = self.parse_comments(root, raw)\n except:\n self.log.exception('Error parsing comments for url: %r' % self.url)\n\n try:\n series, series_index = self.parse_series(root)\n if series:\n mi.series, mi.series_index = series, series_index\n elif self.testing:\n mi.series, mi.series_index = 'Dummy series for testing', 1\n except:\n self.log.exception('Error parsing series for url: %r' % self.url)\n\n try:\n mi.tags = self.parse_tags(root)\n except:\n self.log.exception('Error parsing tags for url: %r' % self.url)\n\n try:\n self.cover_url = self.parse_cover(root, raw)\n except:\n self.log.exception('Error parsing cover for url: %r' % self.url)\n if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'):\n self.cover_url = self.cover_url_processor(self.cover_url)\n mi.has_cover = bool(self.cover_url)\n\n detail_bullets = root.xpath('//*[@data-feature-name=\"detailBullets\"]')\n non_hero = tuple(self.selector(\n 'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(\n '#productDetails_techSpec_sections'))\n feature_and_detail_bullets = root.xpath('//*[@data-feature-name=\"featureBulletsAndDetailBullets\"]')\n if detail_bullets:\n self.parse_detail_bullets(root, mi, detail_bullets[0])\n elif non_hero:\n try:\n self.parse_new_details(root, mi, non_hero[0])\n except:\n self.log.exception(\n 'Failed to parse new-style book details section')\n elif feature_and_detail_bullets:\n self.parse_detail_bullets(root, mi, feature_and_detail_bullets[0], ul_selector='ul')\n\n else:\n pd = root.xpath(self.pd_xpath)\n if pd:\n pd = pd[0]\n\n try:\n isbn = self.parse_isbn(pd)\n if isbn:\n self.isbn = mi.isbn = isbn\n except:\n self.log.exception(\n 'Error parsing ISBN for url: %r' % self.url)\n\n try:\n mi.publisher = self.parse_publisher(pd)\n except:\n self.log.exception(\n 'Error parsing publisher for url: %r' % self.url)\n\n try:\n mi.pubdate = self.parse_pubdate(pd)\n except:\n self.log.exception(\n 'Error parsing publish date for url: %r' % self.url)\n\n try:\n lang = self.parse_language(pd)\n if lang:\n mi.language = lang\n except:\n self.log.exception(\n 'Error parsing language for url: %r' % self.url)\n\n else:\n self.log.warning(\n 'Failed to find product description for url: %r' % self.url)\n\n mi.source_relevance = self.relevance\n\n if self.amazon_id:\n if self.isbn:\n self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)\n if self.cover_url:\n self.plugin.cache_identifier_to_cover_url(self.amazon_id,\n self.cover_url)\n\n self.plugin.clean_downloaded_metadata(mi)\n\n if self.filter_result(mi, self.log):\n self.result_queue.put(mi)\n\n def totext(self, elem, only_printable=False):\n res = self.tostring(elem, encoding='unicode', method='text')\n if only_printable:\n try:\n filtered_characters = list(s for s in res if s.isprintable())\n except AttributeError:\n filtered_characters = list(s for s in res if s in string.printable)\n res = ''.join(filtered_characters)\n return res.strip()\n\n def parse_title(self, root):\n\n def sanitize_title(title):\n ans = title.strip()\n if not ans.startswith('['):\n ans = re.sub(r'[(\\[].*[)\\]]', '', title).strip()\n return ans\n\n h1 = root.xpath('//h1[@id=\"title\"]')\n if h1:\n h1 = h1[0]\n for child in h1.xpath('./*[contains(@class, \"a-color-secondary\")]'):\n h1.remove(child)\n return sanitize_title(self.totext(h1))\n # audiobooks\n elem = root.xpath('//*[@id=\"productTitle\"]')\n if elem:\n return sanitize_title(self.totext(elem[0]))\n tdiv = root.xpath('//h1[contains(@class, \"parseasinTitle\")]')\n if not tdiv:\n span = root.xpath('//*[@id=\"ebooksTitle\"]')\n if span:\n return sanitize_title(self.totext(span[0]))\n h1 = root.xpath('//h1[@data-feature-name=\"title\"]')\n if h1:\n return sanitize_title(self.totext(h1[0]))\n raise ValueError('No title block found')\n tdiv = tdiv[0]\n actual_title = tdiv.xpath('descendant::*[@id=\"btAsinTitle\"]')\n if actual_title:\n title = self.tostring(actual_title[0], encoding='unicode',\n method='text').strip()\n else:\n title = self.tostring(tdiv, encoding='unicode',\n method='text').strip()\n return sanitize_title(title)\n\n def parse_authors(self, root):\n for sel in (\n '#byline .author .contributorNameID',\n '#byline .author a.a-link-normal',\n '#bylineInfo .author .contributorNameID',\n '#bylineInfo .author a.a-link-normal',\n '#bylineInfo #bylineContributor',\n '#bylineInfo #contributorLink',\n ):\n matches = tuple(self.selector(sel))\n if matches:\n authors = [self.totext(x) for x in matches]\n return [a for a in authors if a]\n\n x = '//h1[contains(@class, \"parseasinTitle\")]/following-sibling::span/*[(name()=\"a\" and @href) or (name()=\"span\" and @class=\"contributorNameTrigger\")]'\n aname = root.xpath(x)\n if not aname:\n aname = root.xpath('''\n //h1[contains(@class, \"parseasinTitle\")]/following-sibling::*[(name()=\"a\" and @href) or (name()=\"span\" and @class=\"contributorNameTrigger\")]\n ''')\n for x in aname:\n x.tail = ''\n authors = [self.tostring(x, encoding='unicode', method='text').strip() for x\n in aname]\n authors = [a for a in authors if a]\n return authors\n\n def parse_rating(self, root):\n for x in root.xpath('//div[@id=\"cpsims-feature\" or @id=\"purchase-sims-feature\" or @id=\"rhf\"]'):\n # Remove the similar books section as it can cause spurious\n # ratings matches\n x.getparent().remove(x)\n\n rating_paths = (\n '//div[@data-feature-name=\"averageCustomerReviews\" or @id=\"averageCustomerReviews\"]',\n '//div[@class=\"jumpBar\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]',\n '//div[@class=\"buying\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]',\n '//span[@class=\"crAvgStars\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]'\n )\n ratings = None\n for p in rating_paths:\n ratings = root.xpath(p)\n if ratings:\n break\n\n def parse_ratings_text(text):\n try:\n m = self.ratings_pat.match(text)\n return float(m.group(1).replace(',', '.')) / float(m.group(3)) * 5\n except Exception:\n pass\n\n if ratings:\n ratings = ratings[0]\n for elem in ratings.xpath('descendant::*[@title]'):\n t = elem.get('title').strip()\n if self.domain == 'cn':\n m = self.ratings_pat_cn.match(t)\n if m is not None:\n return float(m.group(1))\n elif self.domain == 'jp':\n m = self.ratings_pat_jp.match(t)\n if m is not None:\n return float(m.group(1))\n else:\n ans = parse_ratings_text(t)\n if ans is not None:\n return ans\n for elem in ratings.xpath('descendant::span[@class=\"a-icon-alt\"]'):\n t = self.tostring(\n elem, encoding='unicode', method='text', with_tail=False).strip()\n ans = parse_ratings_text(t)\n if ans is not None:\n return ans\n else:\n # found in kindle book pages on amazon.com\n for x in root.xpath('//a[@id=\"acrCustomerReviewLink\"]'):\n spans = x.xpath('./span')\n if spans:\n txt = self.tostring(spans[0], method='text', encoding='unicode', with_tail=False).strip()\n try:\n return float(txt.replace(',', '.'))\n except Exception:\n pass\n\n def _render_comments(self, desc):\n from calibre.library.comments import sanitize_comments_html\n\n for c in desc.xpath('descendant::noscript'):\n c.getparent().remove(c)\n for c in desc.xpath('descendant::*[@class=\"seeAll\" or'\n ' @class=\"emptyClear\" or @id=\"collapsePS\" or'\n ' @id=\"expandPS\"]'):\n c.getparent().remove(c)\n for b in desc.xpath('descendant::b[@style]'):\n # Bing highlights search results\n s = b.get('style', '')\n if 'color' in s:\n b.tag = 'span'\n del b.attrib['style']\n\n for a in desc.xpath('descendant::a[@href]'):\n del a.attrib['href']\n a.tag = 'span'\n for a in desc.xpath('descendant::span[@class=\"a-text-italic\"]'):\n a.tag = 'i'\n for a in desc.xpath('descendant::span[@class=\"a-text-bold\"]'):\n a.tag = 'b'\n desc = self.tostring(desc, method='html', encoding='unicode').strip()\n desc = xml_replace_entities(desc, 'utf-8')\n\n # Encoding bug in Amazon data U+fffd (replacement char)\n # in some examples it is present in place of '\n desc = desc.replace('\\ufffd', \"'\")\n # remove all attributes from tags\n desc = re.sub(r'<([a-zA-Z0-9]+)\\s[^>]+>', r'<\\1>', desc)\n # Collapse whitespace\n # desc = re.sub('\\n+', '\\n', desc)\n # desc = re.sub(' +', ' ', desc)\n # Remove the notice about text referring to out of print editions\n desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)\n # Remove comments\n desc = re.sub(r'(?s)<!--.*?-->', '', desc)\n return sanitize_comments_html(desc)\n\n def parse_comments(self, root, raw):\n try:\n from urllib.parse import unquote\n except ImportError:\n from urllib import unquote\n ans = ''\n ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionOverviewContent_feature_div')) or tuple(\n self.selector('#drengr_DesktopTabbedDescriptionOverviewContent_feature_div'))\n if ovr:\n ovr = ovr[0]\n ovr.tag = 'div'\n ans = self._render_comments(ovr)\n ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionEditorialsContent_feature_div')) or tuple(\n self.selector('#drengr_DesktopTabbedDescriptionEditorialsContent_feature_div'))\n if ovr:\n ovr = ovr[0]\n ovr.tag = 'div'\n ans += self._render_comments(ovr)\n else:\n ns = tuple(self.selector('#bookDescription_feature_div noscript'))\n if ns:\n ns = ns[0]\n if len(ns) == 0 and ns.text:\n import html5lib\n\n # html5lib parsed noscript as CDATA\n ns = html5lib.parseFragment(\n '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]\n else:\n ns.tag = 'div'\n ans = self._render_comments(ns)\n else:\n desc = root.xpath('//div[@id=\"ps-content\"]/div[@class=\"content\"]')\n if desc:\n ans = self._render_comments(desc[0])\n else:\n ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content'))\n if ns:\n ans = self._render_comments(ns[0])\n # audiobooks\n if not ans:\n elem = root.xpath('//*[@id=\"audible_desktopTabbedDescriptionOverviewContent_feature_div\"]')\n if elem:\n ans = self._render_comments(elem[0])\n desc = root.xpath(\n '//div[@id=\"productDescription\"]/*[@class=\"content\"]')\n if desc:\n ans += self._render_comments(desc[0])\n else:\n # Idiot chickens from amazon strike again. This data is now stored\n # in a JS variable inside a script tag URL encoded.\n m = re.search(br'var\\s+iframeContent\\s*=\\s*\"([^\"]+)\"', raw)\n if m is not None:\n try:\n text = unquote(m.group(1)).decode('utf-8')\n nr = parse_html(text)\n desc = nr.xpath(\n '//div[@id=\"productDescription\"]/*[@class=\"content\"]')\n if desc:\n ans += self._render_comments(desc[0])\n except Exception as e:\n self.log.warn(\n 'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))\n else:\n desc = root.xpath('//div[@id=\"productDescription_fullView\"]')\n if desc:\n ans += self._render_comments(desc[0])\n\n return ans\n\n def parse_series(self, root):\n ans = (None, None)\n\n # This is found on kindle pages for books on amazon.com\n series = root.xpath('//*[@id=\"rpi-attribute-book_details-series\"]')\n if series:\n spans = series[0].xpath('descendant::span')\n if spans:\n texts = [self.tostring(x, encoding='unicode', method='text', with_tail=False).strip() for x in spans]\n texts = list(filter(None, texts))\n if len(texts) == 2:\n idxinfo, series = texts\n m = re.search(r'[0-9.]+', idxinfo.strip())\n if m is not None:\n ans = series, float(m.group())\n return ans\n\n # This is found on the paperback/hardback pages for books on amazon.com\n series = root.xpath('//div[@data-feature-name=\"seriesTitle\"]')\n if series:\n series = series[0]\n spans = series.xpath('./span')\n if spans:\n raw = self.tostring(\n spans[0], encoding='unicode', method='text', with_tail=False).strip()\n m = re.search(r'\\s+([0-9.]+)$', raw.strip())\n if m is not None:\n series_index = float(m.group(1))\n s = series.xpath('./a[@id=\"series-page-link\"]')\n if s:\n series = self.tostring(\n s[0], encoding='unicode', method='text', with_tail=False).strip()\n if series:\n ans = (series, series_index)\n else:\n series = root.xpath('//div[@id=\"seriesBulletWidget_feature_div\"]')\n if series:\n a = series[0].xpath('descendant::a')\n if a:\n raw = self.tostring(a[0], encoding='unicode', method='text', with_tail=False)\n if self.domain == 'jp':\n m = re.search(r'(?P<index>[0-9.]+)\\s*(?:巻|冊)\\s*\\(全\\s*([0-9.]+)\\s*(?:巻|冊)\\):\\s*(?P<series>.+)', raw.strip())\n else:\n m = re.search(r'(?:Book|Libro|Buch)\\s+(?P<index>[0-9.]+)\\s+(?:of|de|von)\\s+([0-9.]+)\\s*:\\s*(?P<series>.+)', raw.strip())\n if m is not None:\n ans = (m.group('series').strip(), float(m.group('index')))\n\n # This is found on Kindle edition pages on amazon.com\n if ans == (None, None):\n for span in root.xpath('//div[@id=\"aboutEbooksSection\"]//li/span'):\n text = (span.text or '').strip()\n m = re.match(r'Book\\s+([0-9.]+)', text)\n if m is not None:\n series_index = float(m.group(1))\n a = span.xpath('./a[@href]')\n if a:\n series = self.tostring(\n a[0], encoding='unicode', method='text', with_tail=False).strip()\n if series:\n ans = (series, series_index)\n # This is found on newer Kindle edition pages on amazon.com\n if ans == (None, None):\n for b in root.xpath('//div[@id=\"reviewFeatureGroup\"]/span/b'):\n text = (b.text or '').strip()\n m = re.match(r'Book\\s+([0-9.]+)', text)\n if m is not None:\n series_index = float(m.group(1))\n a = b.getparent().xpath('./a[@href]')\n if a:\n series = self.tostring(\n a[0], encoding='unicode', method='text', with_tail=False).partition('(')[0].strip()\n if series:\n ans = series, series_index\n\n if ans == (None, None):\n desc = root.xpath('//div[@id=\"ps-content\"]/div[@class=\"buying\"]')\n if desc:\n raw = self.tostring(desc[0], method='text', encoding='unicode')\n raw = re.sub(r'\\s+', ' ', raw)\n match = self.series_pat.search(raw)\n if match is not None:\n s, i = match.group('series'), float(match.group('index'))\n if s:\n ans = (s, i)\n if ans[0]:\n ans = (re.sub(r'\\s+Series$', '', ans[0]).strip(), ans[1])\n ans = (re.sub(r'\\(.+?\\s+Series\\)$', '', ans[0]).strip(), ans[1])\n return ans\n\n def parse_tags(self, root):\n ans = []\n exclude_tokens = {'kindle', 'a-z'}\n exclude = {'special features', 'by authors',\n 'authors & illustrators', 'books', 'new; used & rental textbooks'}\n seen = set()\n for li in root.xpath(self.tags_xpath):\n for i, a in enumerate(li.iterdescendants('a')):\n if i > 0:\n # we ignore the first category since it is almost always\n # too broad\n raw = (a.text or '').strip().replace(',', ';')\n lraw = icu_lower(raw)\n tokens = frozenset(lraw.split())\n if raw and lraw not in exclude and not tokens.intersection(exclude_tokens) and lraw not in seen:\n ans.append(raw)\n seen.add(lraw)\n return ans\n\n def parse_cover(self, root, raw=b\"\"):\n # Look for the image URL in javascript, using the first image in the\n # image gallery as the cover\n import json\n imgpat = re.compile(r'\"hiRes\":\"(.+?)\",\"thumb\"')\n for script in root.xpath('//script'):\n m = imgpat.search(script.text or '')\n if m is not None:\n return m.group(1)\n imgpat = re.compile(r\"\"\"'imageGalleryData'\\s*:\\s*(\\[\\s*{.+])\"\"\")\n for script in root.xpath('//script'):\n m = imgpat.search(script.text or '')\n if m is not None:\n try:\n return json.loads(m.group(1))[0]['mainUrl']\n except Exception:\n continue\n\n def clean_img_src(src):\n parts = src.split('/')\n if len(parts) > 3:\n bn = parts[-1]\n sparts = bn.split('_')\n if len(sparts) > 2:\n bn = re.sub(r'\\.\\.jpg$', '.jpg', (sparts[0] + sparts[-1]))\n return ('/'.join(parts[:-1])) + '/' + bn\n\n imgpat2 = re.compile(r'var imageSrc = \"([^\"]+)\"')\n for script in root.xpath('//script'):\n m = imgpat2.search(script.text or '')\n if m is not None:\n src = m.group(1)\n url = clean_img_src(src)\n if url:\n return url\n\n imgs = root.xpath(\n '//img[(@id=\"prodImage\" or @id=\"original-main-image\" or @id=\"main-image\" or @id=\"main-image-nonjs\") and @src]')\n if not imgs:\n imgs = (\n root.xpath('//div[@class=\"main-image-inner-wrapper\"]/img[@src]') or\n root.xpath('//div[@id=\"main-image-container\" or @id=\"ebooks-main-image-container\"]//img[@src]') or\n root.xpath(\n '//div[@id=\"mainImageContainer\"]//img[@data-a-dynamic-image]')\n )\n for img in imgs:\n try:\n idata = json.loads(img.get('data-a-dynamic-image'))\n except Exception:\n imgs = ()\n else:\n mwidth = 0\n try:\n url = None\n for iurl, (width, height) in idata.items():\n if width > mwidth:\n mwidth = width\n url = iurl\n\n return url\n except Exception:\n pass\n\n for img in imgs:\n src = img.get('src')\n if 'data:' in src:\n continue\n if 'loading-' in src:\n js_img = re.search(br'\"largeImage\":\"(https?://[^\"]+)\",', raw)\n if js_img:\n src = js_img.group(1).decode('utf-8')\n if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):\n self.log('Found image: %s' % src)\n url = clean_img_src(src)\n if url:\n return url\n\n def parse_detail_bullets(self, root, mi, container, ul_selector='.detail-bullet-list'):\n try:\n ul = next(self.selector(ul_selector, root=container))\n except StopIteration:\n return\n for span in self.selector('.a-list-item', root=ul):\n cells = span.xpath('./span')\n if len(cells) >= 2:\n self.parse_detail_cells(mi, cells[0], cells[1])\n\n def parse_new_details(self, root, mi, non_hero):\n table = non_hero.xpath('descendant::table')[0]\n for tr in table.xpath('descendant::tr'):\n cells = tr.xpath('descendant::*[local-name()=\"td\" or local-name()=\"th\"]')\n if len(cells) == 2:\n self.parse_detail_cells(mi, cells[0], cells[1])\n\n def parse_detail_cells(self, mi, c1, c2):\n name = self.totext(c1, only_printable=True).strip().strip(':').strip()\n val = self.totext(c2)\n val = val.replace('\\u200e', '').replace('\\u200f', '')\n if not val:\n return\n if name in self.language_names:\n ans = self.lang_map.get(val)\n if not ans:\n ans = canonicalize_lang(val)\n if ans:\n mi.language = ans\n elif name in self.publisher_names:\n pub = val.partition(';')[0].partition('(')[0].strip()\n if pub:\n mi.publisher = pub\n date = val.rpartition('(')[-1].replace(')', '').strip()\n try:\n from calibre.utils.date import parse_only_date\n date = self.delocalize_datestr(date)\n mi.pubdate = parse_only_date(date, assume_utc=True)\n except:\n self.log.exception('Failed to parse pubdate: %s' % val)\n elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:\n ans = check_isbn(val)\n if ans:\n self.isbn = mi.isbn = ans\n elif name in {'Publication date'}:\n from calibre.utils.date import parse_only_date\n date = self.delocalize_datestr(val)\n mi.pubdate = parse_only_date(date, assume_utc=True)\n\n def parse_isbn(self, pd):\n items = pd.xpath(\n 'descendant::*[starts-with(text(), \"ISBN\")]')\n if not items:\n items = pd.xpath(\n 'descendant::b[contains(text(), \"ISBN:\")]')\n for x in reversed(items):\n if x.tail:\n ans = check_isbn(x.tail.strip())\n if ans:\n return ans\n\n def parse_publisher(self, pd):\n for x in reversed(pd.xpath(self.publisher_xpath)):\n if x.tail:\n ans = x.tail.partition(';')[0]\n return ans.partition('(')[0].strip()\n\n def parse_pubdate(self, pd):\n from calibre.utils.date import parse_only_date\n for x in reversed(pd.xpath(self.pubdate_xpath)):\n if x.tail:\n date = x.tail.strip()\n date = self.delocalize_datestr(date)\n try:\n return parse_only_date(date, assume_utc=True)\n except Exception:\n pass\n for x in reversed(pd.xpath(self.publisher_xpath)):\n if x.tail:\n ans = x.tail\n date = ans.rpartition('(')[-1].replace(')', '').strip()\n date = self.delocalize_datestr(date)\n try:\n return parse_only_date(date, assume_utc=True)\n except Exception:\n pass\n\n def parse_language(self, pd):\n for x in reversed(pd.xpath(self.language_xpath)):\n if x.tail:\n raw = x.tail.strip().partition(',')[0].strip()\n ans = self.lang_map.get(raw, None)\n if ans:\n return ans\n ans = canonicalize_lang(ans)\n if ans:\n return ans\n# }}}\n\n\nclass Amazon(Source):\n\n name = 'Amazon.com'\n version = (1, 3, 9)\n minimum_calibre_version = (2, 82, 0)\n description = _('Downloads metadata and covers from Amazon')\n\n capabilities = frozenset(('identify', 'cover'))\n touched_fields = frozenset(('title', 'authors', 'identifier:amazon',\n 'rating', 'comments', 'publisher', 'pubdate',\n 'languages', 'series', 'tags'))\n has_html_comments = True\n supports_gzip_transfer_encoding = True\n prefer_results_with_isbn = False\n\n AMAZON_DOMAINS = {\n 'com': _('US'),\n 'fr': _('France'),\n 'de': _('Germany'),\n 'uk': _('UK'),\n 'au': _('Australia'),\n 'it': _('Italy'),\n 'jp': _('Japan'),\n 'es': _('Spain'),\n 'br': _('Brazil'),\n 'in': _('India'),\n 'nl': _('Netherlands'),\n 'cn': _('China'),\n 'ca': _('Canada'),\n 'se': _('Sweden'),\n }\n\n SERVERS = {\n 'auto': _('Choose server automatically'),\n 'amazon': _('Amazon servers'),\n 'bing': _('Bing search cache'),\n 'google': _('Google search cache'),\n 'wayback': _('Wayback machine cache (slow)'),\n 'ddg': _('DuckDuckGo search and Google cache'),\n }\n\n options = (\n Option('domain', 'choices', 'com', _('Amazon country website to use:'),\n _('Metadata from Amazon will be fetched using this '\n 'country\\'s Amazon website.'), choices=AMAZON_DOMAINS),\n Option('server', 'choices', 'auto', _('Server to get data from:'),\n _(\n 'Amazon has started blocking attempts to download'\n ' metadata from its servers. To get around this problem,'\n ' calibre can fetch the Amazon data from many different'\n ' places where it is cached. Choose the source you prefer.'\n ), choices=SERVERS),\n Option('use_mobi_asin', 'bool', False, _('Use the MOBI-ASIN for metadata search'),\n _(\n 'Enable this option to search for metadata with an'\n ' ASIN identifier from the MOBI file at the current country website,'\n ' unless any other amazon id is available. Note that if the'\n ' MOBI file came from a different Amazon country store, you could get'\n ' incorrect results.'\n )),\n Option('prefer_kindle_edition', 'bool', False, _('Prefer the Kindle edition, when available'),\n _(\n 'When searching for a book and the search engine returns both paper and Kindle editions,'\n ' always prefer the Kindle edition, instead of whatever the search engine returns at the'\n ' top.')\n ),\n )\n\n def __init__(self, *args, **kwargs):\n Source.__init__(self, *args, **kwargs)\n self.set_amazon_id_touched_fields()\n\n def id_from_url(self, url):\n from polyglot.urllib import urlparse\n purl = urlparse(url)\n if purl.netloc and purl.path and '/dp/' in purl.path:\n host_parts = tuple(x.lower() for x in purl.netloc.split('.'))\n if 'amazon' in host_parts:\n domain = host_parts[-1]\n parts = purl.path.split('/')\n idx = parts.index('dp')\n try:\n val = parts[idx+1]\n except IndexError:\n return\n aid = 'amazon' if domain == 'com' else ('amazon_' + domain)\n return aid, val\n\n def test_fields(self, mi):\n '''\n Return the first field from self.touched_fields that is null on the\n mi object\n '''\n for key in self.touched_fields:\n if key.startswith('identifier:'):\n key = key.partition(':')[-1]\n if key == 'amazon':\n if self.domain != 'com':\n key += '_' + self.domain\n if not mi.has_identifier(key):\n return 'identifier: ' + key\n elif mi.is_null(key):\n return key\n\n @property\n def browser(self):\n br = self._browser\n if br is None:\n ua = 'Mobile '\n while not user_agent_is_ok(ua):\n ua = random_user_agent(allow_ie=False)\n # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'\n self._browser = br = browser(user_agent=ua)\n br.set_handle_gzip(True)\n if self.use_search_engine:\n br.addheaders += [\n ('Accept', accept_header_for_ua(ua)),\n ('Upgrade-insecure-requests', '1'),\n ]\n else:\n br.addheaders += [\n ('Accept', accept_header_for_ua(ua)),\n ('Upgrade-insecure-requests', '1'),\n ('Referer', self.referrer_for_domain()),\n ]\n return br\n\n def save_settings(self, *args, **kwargs):\n Source.save_settings(self, *args, **kwargs)\n self.set_amazon_id_touched_fields()\n\n def set_amazon_id_touched_fields(self):\n ident_name = \"identifier:amazon\"\n if self.domain != 'com':\n ident_name += '_' + self.domain\n tf = [x for x in self.touched_fields if not\n x.startswith('identifier:amazon')] + [ident_name]\n self.touched_fields = frozenset(tf)\n\n def get_domain_and_asin(self, identifiers, extra_domains=()):\n identifiers = {k.lower(): v for k, v in identifiers.items()}\n for key, val in identifiers.items():\n if key in ('amazon', 'asin'):\n return 'com', val\n if key.startswith('amazon_'):\n domain = key.partition('_')[-1]\n if domain and (domain in self.AMAZON_DOMAINS or domain in extra_domains):\n return domain, val\n if self.prefs['use_mobi_asin']:\n val = identifiers.get('mobi-asin')\n if val is not None:\n return self.domain, val\n return None, None\n\n def referrer_for_domain(self, domain=None):\n domain = domain or self.domain\n return {\n 'uk': 'https://www.amazon.co.uk/',\n 'au': 'https://www.amazon.com.au/',\n 'br': 'https://www.amazon.com.br/',\n 'jp': 'https://www.amazon.co.jp/',\n 'mx': 'https://www.amazon.com.mx/',\n }.get(domain, 'https://www.amazon.%s/' % domain)\n\n def _get_book_url(self, identifiers): # {{{\n domain, asin = self.get_domain_and_asin(\n identifiers, extra_domains=('au', 'ca'))\n if domain and asin:\n url = None\n r = self.referrer_for_domain(domain)\n if r is not None:\n url = r + 'dp/' + asin\n if url:\n idtype = 'amazon' if domain == 'com' else 'amazon_' + domain\n return domain, idtype, asin, url\n\n def get_book_url(self, identifiers):\n ans = self._get_book_url(identifiers)\n if ans is not None:\n return ans[1:]\n\n def get_book_url_name(self, idtype, idval, url):\n if idtype == 'amazon':\n return self.name\n return 'A' + idtype.replace('_', '.')[1:]\n # }}}\n\n @property\n def domain(self):\n x = getattr(self, 'testing_domain', None)\n if x is not None:\n return x\n domain = self.prefs['domain']\n if domain not in self.AMAZON_DOMAINS:\n domain = 'com'\n\n return domain\n\n @property\n def server(self):\n x = getattr(self, 'testing_server', None)\n if x is not None:\n return x\n server = self.prefs['server']\n if server not in self.SERVERS:\n server = 'auto'\n return server\n\n @property\n def use_search_engine(self):\n return self.server != 'amazon'\n\n def clean_downloaded_metadata(self, mi):\n docase = (\n mi.language == 'eng' or\n (mi.is_null('language') and self.domain in {'com', 'uk', 'au'})\n )\n if mi.title and docase:\n # Remove series information from title\n m = re.search(r'\\S+\\s+(\\(.+?\\s+Book\\s+\\d+\\))$', mi.title)\n if m is not None:\n mi.title = mi.title.replace(m.group(1), '').strip()\n mi.title = fixcase(mi.title)\n mi.authors = fixauthors(mi.authors)\n if mi.tags and docase:\n mi.tags = list(map(fixcase, mi.tags))\n mi.isbn = check_isbn(mi.isbn)\n if mi.series and docase:\n mi.series = fixcase(mi.series)\n if mi.title and mi.series:\n for pat in (r':\\s*Book\\s+\\d+\\s+of\\s+%s$', r'\\(%s\\)$', r':\\s*%s\\s+Book\\s+\\d+$'):\n pat = pat % re.escape(mi.series)\n q = re.sub(pat, '', mi.title, flags=re.I).strip()\n if q and q != mi.title:\n mi.title = q\n break\n\n def get_website_domain(self, domain):\n return {'uk': 'co.uk', 'jp': 'co.jp', 'br': 'com.br', 'au': 'com.au'}.get(domain, domain)\n\n def create_query(self, log, title=None, authors=None, identifiers={}, # {{{\n domain=None, for_amazon=True):\n try:\n from urllib.parse import unquote_plus, urlencode\n except ImportError:\n from urllib import unquote_plus, urlencode\n if domain is None:\n domain = self.domain\n\n idomain, asin = self.get_domain_and_asin(identifiers)\n if idomain is not None:\n domain = idomain\n\n # See the amazon detailed search page to get all options\n terms = []\n q = {'search-alias': 'aps',\n 'unfiltered': '1',\n }\n\n if domain == 'com':\n q['sort'] = 'relevanceexprank'\n else:\n q['sort'] = 'relevancerank'\n\n isbn = check_isbn(identifiers.get('isbn', None))\n\n if asin is not None:\n q['field-keywords'] = asin\n terms.append(asin)\n elif isbn is not None:\n q['field-isbn'] = isbn\n if len(isbn) == 13:\n terms.extend('({} OR {}-{})'.format(isbn, isbn[:3], isbn[3:]).split())\n else:\n terms.append(isbn)\n else:\n # Only return book results\n q['search-alias'] = {'br': 'digital-text',\n 'nl': 'aps'}.get(domain, 'stripbooks')\n if title:\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n q['field-title'] = ' '.join(title_tokens)\n terms.extend(title_tokens)\n if authors:\n author_tokens = list(self.get_author_tokens(authors,\n only_first_author=True))\n if author_tokens:\n q['field-author'] = ' '.join(author_tokens)\n terms.extend(author_tokens)\n\n if not ('field-keywords' in q or 'field-isbn' in q or\n ('field-title' in q)):\n # Insufficient metadata to make an identify query\n log.error('Insufficient metadata to construct query, none of title, ISBN or ASIN supplied')\n raise SearchFailed()\n\n if not for_amazon:\n return terms, domain\n\n if domain == 'nl':\n q['__mk_nl_NL'] = 'ÅMÅŽÕÑ'\n if 'field-keywords' not in q:\n q['field-keywords'] = ''\n for f in 'field-isbn field-title field-author'.split():\n q['field-keywords'] += ' ' + q.pop(f, '')\n q['field-keywords'] = q['field-keywords'].strip()\n\n encoded_q = dict([(x.encode('utf-8', 'ignore'), y.encode(\n 'utf-8', 'ignore')) for x, y in q.items()])\n url_query = urlencode(encoded_q)\n # amazon's servers want IRIs with unicode characters not percent esaped\n parts = []\n for x in url_query.split(b'&' if isinstance(url_query, bytes) else '&'):\n k, v = x.split(b'=' if isinstance(x, bytes) else '=', 1)\n parts.append('{}={}'.format(iri_quote_plus(unquote_plus(k)), iri_quote_plus(unquote_plus(v))))\n url_query = '&'.join(parts)\n url = 'https://www.amazon.%s/s/?' % self.get_website_domain(\n domain) + url_query\n return url, domain\n\n # }}}\n\n def get_cached_cover_url(self, identifiers): # {{{\n url = None\n domain, asin = self.get_domain_and_asin(identifiers)\n if asin is None:\n isbn = identifiers.get('isbn', None)\n if isbn is not None:\n asin = self.cached_isbn_to_identifier(isbn)\n if asin is not None:\n url = self.cached_identifier_to_cover_url(asin)\n\n return url\n # }}}\n\n def parse_results_page(self, root, domain): # {{{\n from lxml.html import tostring\n\n matches = []\n\n def title_ok(title):\n title = title.lower()\n bad = ['bulk pack', '[audiobook]', '[audio cd]',\n '(a book companion)', '( slipcase with door )', ': free sampler']\n if self.domain == 'com':\n bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])\n for x in bad:\n if x in title:\n return False\n if title and title[0] in '[{' and re.search(r'\\(\\s*author\\s*\\)', title) is not None:\n # Bad entries in the catalog\n return False\n return True\n\n for query in (\n '//div[contains(@class, \"s-result-list\")]//h2/a[@href]',\n '//div[contains(@class, \"s-result-list\")]//div[@data-index]//h5//a[@href]',\n r'//li[starts-with(@id, \"result_\")]//a[@href and contains(@class, \"s-access-detail-page\")]',\n ):\n result_links = root.xpath(query)\n if result_links:\n break\n for a in result_links:\n title = tostring(a, method='text', encoding='unicode')\n if title_ok(title):\n url = a.get('href')\n if url.startswith('/'):\n url = 'https://www.amazon.%s%s' % (\n self.get_website_domain(domain), url)\n matches.append(url)\n\n if not matches:\n # Previous generation of results page markup\n for div in root.xpath(r'//div[starts-with(@id, \"result_\")]'):\n links = div.xpath(r'descendant::a[@class=\"title\" and @href]')\n if not links:\n # New amazon markup\n links = div.xpath('descendant::h3/a[@href]')\n for a in links:\n title = tostring(a, method='text', encoding='unicode')\n if title_ok(title):\n url = a.get('href')\n if url.startswith('/'):\n url = 'https://www.amazon.%s%s' % (\n self.get_website_domain(domain), url)\n matches.append(url)\n break\n\n if not matches:\n # This can happen for some user agents that Amazon thinks are\n # mobile/less capable\n for td in root.xpath(\n r'//div[@id=\"Results\"]/descendant::td[starts-with(@id, \"search:Td:\")]'):\n for a in td.xpath(r'descendant::td[@class=\"dataColumn\"]/descendant::a[@href]/span[@class=\"srTitle\"]/..'):\n title = tostring(a, method='text', encoding='unicode')\n if title_ok(title):\n url = a.get('href')\n if url.startswith('/'):\n url = 'https://www.amazon.%s%s' % (\n self.get_website_domain(domain), url)\n matches.append(url)\n break\n if not matches and root.xpath('//form[@action=\"/errors/validateCaptcha\"]'):\n raise CaptchaError('Amazon returned a CAPTCHA page. Recently Amazon has begun using statistical'\n ' profiling to block access to its website. As such this metadata plugin is'\n ' unlikely to ever work reliably.')\n\n # Keep only the top 3 matches as the matches are sorted by relevance by\n # Amazon so lower matches are not likely to be very relevant\n return matches[:3]\n # }}}\n\n def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{\n from calibre.ebooks.chardet import xml_to_unicode\n from calibre.utils.cleantext import clean_ascii_chars\n matches = []\n query, domain = self.create_query(log, title=title, authors=authors,\n identifiers=identifiers)\n time.sleep(1)\n try:\n raw = br.open_novisit(query, timeout=timeout).read().strip()\n except Exception as e:\n if callable(getattr(e, 'getcode', None)) and \\\n e.getcode() == 404:\n log.error('Query malformed: %r' % query)\n raise SearchFailed()\n attr = getattr(e, 'args', [None])\n attr = attr if attr else [None]\n if isinstance(attr[0], socket.timeout):\n msg = _('Amazon timed out. Try again later.')\n log.error(msg)\n else:\n msg = 'Failed to make identify query: %r' % query\n log.exception(msg)\n raise SearchFailed()\n\n raw = clean_ascii_chars(xml_to_unicode(raw,\n strip_encoding_pats=True, resolve_entities=True)[0])\n\n if testing:\n import tempfile\n with tempfile.NamedTemporaryFile(prefix='amazon_results_',\n suffix='.html', delete=False) as f:\n f.write(raw.encode('utf-8'))\n print('Downloaded html for results page saved in', f.name)\n\n matches = []\n found = '<title>404 - ' not in raw\n\n if found:\n try:\n root = parse_html(raw)\n except Exception:\n msg = 'Failed to parse amazon page for query: %r' % query\n log.exception(msg)\n raise SearchFailed()\n\n matches = self.parse_results_page(root, domain)\n\n return matches, query, domain, None\n # }}}\n\n def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None): # {{{\n from calibre.ebooks.metadata.sources.update import search_engines_module\n se = search_engines_module()\n terms, domain = self.create_query(log, title=title, authors=authors,\n identifiers=identifiers, for_amazon=False)\n site = self.referrer_for_domain(\n domain)[len('https://'):].partition('/')[0]\n matches = []\n server = override_server or self.server\n urlproc, sfunc = se.google_url_processor, se.google_search\n if server == 'bing':\n urlproc, sfunc = se.bing_url_processor, se.bing_search\n elif server == 'wayback':\n urlproc, sfunc = se.wayback_url_processor, se.ddg_search\n elif server == 'ddg':\n urlproc, sfunc = se.ddg_url_processor, se.ddg_search\n try:\n results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)\n except HTTPError as err:\n if err.code == 429 and sfunc is se.google_search:\n log('Got too many requests error from Google, trying via DuckDuckGo')\n urlproc, sfunc = se.ddg_url_processor, se.ddg_search\n results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)\n else:\n raise\n\n br.set_current_header('Referer', qurl)\n for result in results:\n if abort.is_set():\n return matches, terms, domain, None\n\n purl = urlparse(result.url)\n if '/dp/' in purl.path and site in purl.netloc:\n url = result.cached_url\n if url is None:\n url = se.get_cached_url(result.url, br, timeout=timeout)\n if url is None:\n log('Failed to find cached page for:', result.url)\n continue\n if url not in matches:\n matches.append(url)\n if len(matches) >= 3:\n break\n else:\n log('Skipping non-book result:', result)\n if not matches:\n log('No search engine results for terms:', ' '.join(terms))\n if urlproc is se.google_url_processor:\n # Google does not cache adult titles\n log('Trying the bing search engine instead')\n return self.search_search_engine(br, testing, log, abort, title, authors, identifiers, timeout, 'bing')\n return matches, terms, domain, urlproc\n # }}}\n\n def identify(self, log, result_queue, abort, title=None, authors=None, # {{{\n identifiers={}, timeout=60):\n '''\n Note this method will retry without identifiers automatically if no\n match is found with identifiers.\n '''\n\n testing = getattr(self, 'running_a_test', False)\n\n udata = self._get_book_url(identifiers)\n br = self.browser\n log('User-agent:', br.current_user_agent())\n log('Server:', self.server)\n if testing:\n print('User-agent:', br.current_user_agent())\n if udata is not None and not self.use_search_engine:\n # Try to directly get details page instead of running a search\n # Cannot use search engine as the directly constructed URL is\n # usually redirected to a full URL by amazon, and is therefore\n # not cached\n domain, idtype, asin, durl = udata\n if durl is not None:\n preparsed_root = parse_details_page(\n durl, log, timeout, br, domain)\n if preparsed_root is not None:\n qasin = parse_asin(preparsed_root[1], log, durl)\n if qasin == asin:\n w = Worker(durl, result_queue, br, log, 0, domain,\n self, testing=testing, preparsed_root=preparsed_root, timeout=timeout)\n try:\n w.get_details()\n return\n except Exception:\n log.exception(\n 'get_details failed for url: %r' % durl)\n func = self.search_search_engine if self.use_search_engine else self.search_amazon\n try:\n matches, query, domain, cover_url_processor = func(\n br, testing, log, abort, title, authors, identifiers, timeout)\n except SearchFailed:\n return\n\n if abort.is_set():\n return\n\n if not matches:\n if identifiers and title and authors:\n log('No matches found with identifiers, retrying using only'\n ' title and authors. Query: %r' % query)\n time.sleep(1)\n return self.identify(log, result_queue, abort, title=title,\n authors=authors, timeout=timeout)\n log.error('No matches found with query: %r' % query)\n return\n\n if self.prefs['prefer_kindle_edition']:\n matches = sort_matches_preferring_kindle_editions(matches)\n\n workers = [Worker(\n url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout,\n cover_url_processor=cover_url_processor, filter_result=partial(\n self.filter_result, title, authors, identifiers)) for i, url in enumerate(matches)]\n\n for w in workers:\n # Don't send all requests at the same time\n time.sleep(1)\n w.start()\n if abort.is_set():\n return\n\n while not abort.is_set():\n a_worker_is_alive = False\n for w in workers:\n w.join(0.2)\n if abort.is_set():\n break\n if w.is_alive():\n a_worker_is_alive = True\n if not a_worker_is_alive:\n break\n\n return None\n # }}}\n\n def filter_result(self, title, authors, identifiers, mi, log): # {{{\n if not self.use_search_engine:\n return True\n if title is not None:\n import regex\n only_punctuation_pat = regex.compile(r'^\\p{P}+$')\n\n def tokenize_title(x):\n ans = icu_lower(x).replace(\"'\", '').replace('\"', '').rstrip(':')\n if only_punctuation_pat.match(ans) is not None:\n ans = ''\n return ans\n\n tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}\n tokens.discard('')\n if tokens:\n result_tokens = {tokenize_title(x) for x in mi.title.split()}\n result_tokens.discard('')\n if not tokens.intersection(result_tokens):\n log('Ignoring result:', mi.title, 'as its title does not match')\n return False\n if authors:\n author_tokens = set()\n for author in authors:\n author_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}\n result_tokens = set()\n for author in mi.authors:\n result_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}\n if author_tokens and not author_tokens.intersection(result_tokens):\n log('Ignoring result:', mi.title, 'by', ' & '.join(mi.authors), 'as its author does not match')\n return False\n return True\n # }}}\n\n def download_cover(self, log, result_queue, abort, # {{{\n title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):\n cached_url = self.get_cached_cover_url(identifiers)\n if cached_url is None:\n log.info('No cached cover found, running identify')\n rq = Queue()\n self.identify(log, rq, abort, title=title, authors=authors,\n identifiers=identifiers)\n if abort.is_set():\n return\n if abort.is_set():\n return\n results = []\n while True:\n try:\n results.append(rq.get_nowait())\n except Empty:\n break\n results.sort(key=self.identify_results_keygen(\n title=title, authors=authors, identifiers=identifiers))\n for mi in results:\n cached_url = self.get_cached_cover_url(mi.identifiers)\n if cached_url is not None:\n break\n if cached_url is None:\n log.info('No cover found')\n return\n\n if abort.is_set():\n return\n log('Downloading cover from:', cached_url)\n br = self.browser\n if self.use_search_engine:\n br = br.clone_browser()\n br.set_current_header('Referer', self.referrer_for_domain(self.domain))\n try:\n time.sleep(1)\n cdata = br.open_novisit(\n cached_url, timeout=timeout).read()\n result_queue.put((self, cdata))\n except:\n log.exception('Failed to download cover from:', cached_url)\n # }}}\n\n\ndef manual_tests(domain, **kw): # {{{\n # To run these test use:\n # calibre-debug -c \"from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')\"\n from calibre.ebooks.metadata.sources.test import authors_test, comments_test, isbn_test, series_test, test_identify_plugin, title_test\n all_tests = {}\n all_tests['com'] = [ # {{{\n ( # Paperback with series\n {'identifiers': {'amazon': '1423146786'}},\n [title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)]\n ),\n\n ( # Kindle edition with series\n {'identifiers': {'amazon': 'B0085UEQDO'}},\n [title_test('Three Parts Dead', exact=True),\n series_test('Craft Sequence', 1)]\n ),\n\n ( # + in title and uses id=\"main-image\" for cover\n {'identifiers': {'amazon': '1933988770'}},\n [title_test(\n 'C++ Concurrency in Action: Practical Multithreading', exact=True)]\n ),\n\n\n ( # Different comments markup, using Book Description section\n {'identifiers': {'amazon': '0982514506'}},\n [title_test(\n \"Griffin's Destiny\",\n exact=True),\n comments_test('Jelena'), comments_test('Ashinji'),\n ]\n ),\n\n ( # # in title\n {'title': 'Expert C# 2008 Business Objects',\n 'authors': ['Lhotka']},\n [title_test('Expert C#'),\n authors_test(['Rockford Lhotka'])\n ]\n ),\n\n ( # No specific problems\n {'identifiers': {'isbn': '0743273567'}},\n [title_test('the great gatsby'),\n authors_test(['f. Scott Fitzgerald'])]\n ),\n\n ]\n\n # }}}\n\n all_tests['de'] = [ # {{{\n # series\n (\n {'identifiers': {'isbn': '3499275120'}},\n [title_test('Vespasian: Das Schwert des Tribuns: Historischer Roman',\n exact=False), authors_test(['Robert Fabbri']), series_test('Die Vespasian-Reihe', 1)\n ]\n\n ),\n\n ( # umlaut in title/authors\n {'title': 'Flüsternde Wälder',\n 'authors': ['Nicola Förg']},\n [title_test('Flüsternde Wälder'),\n authors_test(['Nicola Förg'], subset=True)\n ]\n ),\n\n (\n {'identifiers': {'isbn': '9783453314979'}},\n [title_test('Die letzten Wächter: Roman',\n exact=False), authors_test(['Sergej Lukianenko'])\n ]\n\n ),\n\n (\n {'identifiers': {'isbn': '3548283519'}},\n [title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',\n exact=False), authors_test(['Nele Neuhaus'])\n ]\n\n ),\n ] # }}}\n\n all_tests['it'] = [ # {{{\n (\n {'identifiers': {'isbn': '8838922195'}},\n [title_test('La briscola in cinque',\n exact=True), authors_test(['Marco Malvaldi'])\n ]\n\n ),\n ] # }}}\n\n all_tests['fr'] = [ # {{{\n (\n {'identifiers': {'amazon_fr': 'B07L7ST4RS'}},\n [title_test('Le secret de Lola', exact=True),\n authors_test(['Amélie BRIZIO'])\n ]\n ),\n (\n {'identifiers': {'isbn': '2221116798'}},\n [title_test('L\\'étrange voyage de Monsieur Daldry',\n exact=True), authors_test(['Marc Levy'])\n ]\n\n ),\n ] # }}}\n\n all_tests['es'] = [ # {{{\n (\n {'identifiers': {'isbn': '8483460831'}},\n [title_test('Tiempos Interesantes',\n exact=False), authors_test(['Terry Pratchett'])\n ]\n\n ),\n ] # }}}\n\n all_tests['se'] = [ # {{{\n (\n {'identifiers': {'isbn': '9780552140287'}},\n [title_test('Men At Arms: A Discworld Novel: 14',\n exact=False), authors_test(['Terry Pratchett'])\n ]\n\n ),\n ] # }}}\n\n all_tests['jp'] = [ # {{{\n ( # Adult filtering test\n {'identifiers': {'isbn': '4799500066'}},\n [title_test('Bitch Trap'), ]\n ),\n\n ( # isbn -> title, authors\n {'identifiers': {'isbn': '9784101302720'}},\n [title_test('精霊の守り人',\n exact=True), authors_test(['上橋 菜穂子'])\n ]\n ),\n ( # title, authors -> isbn (will use Shift_JIS encoding in query.)\n {'title': '考えない練習',\n 'authors': ['小池 龍之介']},\n [isbn_test('9784093881067'), ]\n ),\n ] # }}}\n\n all_tests['br'] = [ # {{{\n (\n {'title': 'A Ascensão da Sombra'},\n [title_test('A Ascensão da Sombra'), authors_test(['Robert Jordan'])]\n ),\n\n (\n {'title': 'Guerra dos Tronos'},\n [title_test('A Guerra dos Tronos. As Crônicas de Gelo e Fogo - Livro 1'), authors_test(['George R. R. Martin'])\n ]\n\n ),\n ] # }}}\n\n all_tests['nl'] = [ # {{{\n (\n {'title': 'Freakonomics'},\n [title_test('Freakonomics',\n exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg'])\n ]\n\n ),\n ] # }}}\n\n all_tests['cn'] = [ # {{{\n (\n {'identifiers': {'isbn': '9787115369512'}},\n [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),\n authors_test(['[美]sam Williams', '邓楠,李凡希'])]\n ),\n (\n {'title': '爱上Raspberry Pi'},\n [title_test('爱上Raspberry Pi',\n exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希'])\n ]\n\n ),\n ] # }}}\n\n all_tests['ca'] = [ # {{{\n ( # Paperback with series\n {'identifiers': {'isbn': '9781623808747'}},\n [title_test('Parting Shot', exact=True),\n authors_test(['Mary Calmes'])]\n ),\n ( # # in title\n {'title': 'Expert C# 2008 Business Objects',\n 'authors': ['Lhotka']},\n [title_test('Expert C# 2008 Business Objects'),\n authors_test(['Rockford Lhotka'])]\n ),\n ( # noscript description\n {'identifiers': {'amazon_ca': '162380874X'}},\n [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])\n ]\n ),\n ] # }}}\n\n all_tests['in'] = [ # {{{\n ( # Paperback with series\n {'identifiers': {'amazon_in': '1423146786'}},\n [title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True)]\n ),\n ] # }}}\n\n def do_test(domain, start=0, stop=None, server='auto'):\n tests = all_tests[domain]\n if stop is None:\n stop = len(tests)\n tests = tests[start:stop]\n test_identify_plugin(Amazon.name, tests, modify_plugin=lambda p: (\n setattr(p, 'testing_domain', domain),\n setattr(p, 'touched_fields', p.touched_fields - {'tags'}),\n setattr(p, 'testing_server', server),\n ))\n\n do_test(domain, **kw)\n# }}}\n",
|
||
"big_book_search": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom calibre.ebooks.metadata.sources.base import Option, Source\n\n\ndef get_urls(br, tokens):\n from urllib.parse import quote_plus\n\n from html5_parser import parse\n escaped = (quote_plus(x) for x in tokens if x and x.strip())\n q = '+'.join(escaped)\n url = 'https://bigbooksearch.com/please-dont-scrape-my-site-you-will-put-my-api-key-over-the-usage-limit-and-the-site-will-break/books/'+q\n raw = br.open(url).read()\n root = parse(raw.decode('utf-8'))\n urls = [i.get('src') for i in root.xpath('//img[@src]')]\n return urls\n\n\nclass BigBookSearch(Source):\n\n name = 'Big Book Search'\n version = (1, 0, 1)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')\n capabilities = frozenset(['cover'])\n can_get_multiple_covers = True\n options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),\n _('The maximum number of covers to process from the search result')),\n )\n supports_gzip_transfer_encoding = True\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if not title:\n return\n br = self.browser\n tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))\n urls = get_urls(br, tokens)\n self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)\n\n\ndef test():\n import pprint\n\n from calibre import browser\n br = browser()\n urls = get_urls(br, ['consider', 'phlebas', 'banks'])\n pprint.pprint(urls)\n\n\nif __name__ == '__main__':\n test()\n",
|
||
"edelweiss": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nimport re\nimport time\nfrom threading import Thread\n\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\n\nfrom calibre import as_unicode, random_user_agent\nfrom calibre.ebooks.metadata import check_isbn\nfrom calibre.ebooks.metadata.sources.base import Source\n\n\ndef clean_html(raw):\n from calibre.ebooks.chardet import xml_to_unicode\n from calibre.utils.cleantext import clean_ascii_chars\n return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,\n resolve_entities=True, assume_utf8=True)[0])\n\n\ndef parse_html(raw):\n raw = clean_html(raw)\n from html5_parser import parse\n return parse(raw)\n\n\ndef astext(node):\n from lxml import etree\n return etree.tostring(node, method='text', encoding='unicode',\n with_tail=False).strip()\n\n\nclass Worker(Thread): # {{{\n\n def __init__(self, basic_data, relevance, result_queue, br, timeout, log, plugin):\n Thread.__init__(self)\n self.daemon = True\n self.basic_data = basic_data\n self.br, self.log, self.timeout = br, log, timeout\n self.result_queue, self.plugin, self.sku = result_queue, plugin, self.basic_data['sku']\n self.relevance = relevance\n\n def run(self):\n url = ('https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/product/two_Enhanced.ascx&'\n 'sku={0}&idPrefix=content_1_{0}&mode=0'.format(self.sku))\n try:\n raw = self.br.open_novisit(url, timeout=self.timeout).read()\n except:\n self.log.exception('Failed to load comments page: %r'%url)\n return\n\n try:\n mi = self.parse(raw)\n mi.source_relevance = self.relevance\n self.plugin.clean_downloaded_metadata(mi)\n self.result_queue.put(mi)\n except:\n self.log.exception('Failed to parse details for sku: %s'%self.sku)\n\n def parse(self, raw):\n from calibre.ebooks.metadata.book.base import Metadata\n from calibre.utils.date import UNDEFINED_DATE\n root = parse_html(raw)\n mi = Metadata(self.basic_data['title'], self.basic_data['authors'])\n\n # Identifiers\n if self.basic_data['isbns']:\n mi.isbn = self.basic_data['isbns'][0]\n mi.set_identifier('edelweiss', self.sku)\n\n # Tags\n if self.basic_data['tags']:\n mi.tags = self.basic_data['tags']\n mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]\n\n # Publisher\n mi.publisher = self.basic_data['publisher']\n\n # Pubdate\n if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:\n mi.pubdate = self.basic_data['pubdate']\n\n # Rating\n if self.basic_data['rating']:\n mi.rating = self.basic_data['rating']\n\n # Comments\n comments = ''\n for cid in ('summary', 'contributorbio', 'quotes_reviews'):\n cid = 'desc_{}{}-content'.format(cid, self.sku)\n div = root.xpath('//*[@id=\"{}\"]'.format(cid))\n if div:\n comments += self.render_comments(div[0])\n if comments:\n mi.comments = comments\n\n mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None\n return mi\n\n def render_comments(self, desc):\n from lxml import etree\n\n from calibre.library.comments import sanitize_comments_html\n for c in desc.xpath('descendant::noscript'):\n c.getparent().remove(c)\n for a in desc.xpath('descendant::a[@href]'):\n del a.attrib['href']\n a.tag = 'span'\n desc = etree.tostring(desc, method='html', encoding='unicode').strip()\n\n # remove all attributes from tags\n desc = re.sub(r'<([a-zA-Z0-9]+)\\s[^>]+>', r'<\\1>', desc)\n # Collapse whitespace\n # desc = re.sub('\\n+', '\\n', desc)\n # desc = re.sub(' +', ' ', desc)\n # Remove comments\n desc = re.sub(r'(?s)<!--.*?-->', '', desc)\n return sanitize_comments_html(desc)\n# }}}\n\n\ndef get_basic_data(browser, log, *skus):\n from mechanize import Request\n\n from calibre.utils.date import parse_only_date\n zeroes = ','.join('0' for sku in skus)\n data = {\n 'skus': ','.join(skus),\n 'drc': zeroes,\n 'startPosition': '0',\n 'sequence': '1',\n 'selected': zeroes,\n 'itemID': '0',\n 'orderID': '0',\n 'mailingID': '',\n 'tContentWidth': '926',\n 'originalOrder': ','.join(type('')(i) for i in range(len(skus))),\n 'selectedOrderID': '0',\n 'selectedSortColumn': '0',\n 'listType': '1',\n 'resultType': '32',\n 'blockView': '1',\n }\n items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'\n req = Request(items_data_url, data)\n response = browser.open_novisit(req)\n raw = response.read()\n root = parse_html(raw)\n for item in root.xpath('//div[@data-priority]'):\n row = item.getparent().getparent()\n sku = item.get('id').split('-')[-1]\n isbns = [x.strip() for x in row.xpath('descendant::*[contains(@class, \"pev_sku\")]/text()')[0].split(',') if check_isbn(x.strip())]\n isbns.sort(key=len, reverse=True)\n try:\n tags = [x.strip() for x in astext(row.xpath('descendant::*[contains(@class, \"pev_categories\")]')[0]).split('/')]\n except IndexError:\n tags = []\n rating = 0\n for bar in row.xpath('descendant::*[contains(@class, \"bgdColorCommunity\")]/@style'):\n m = re.search(r'width: (\\d+)px;.*max-width: (\\d+)px', bar)\n if m is not None:\n rating = float(m.group(1)) / float(m.group(2))\n break\n try:\n pubdate = parse_only_date(astext(row.xpath('descendant::*[contains(@class, \"pev_shipDate\")]')[0]\n ).split(':')[-1].split(u'\\xa0')[-1].strip(), assume_utc=True)\n except Exception:\n log.exception('Error parsing published date')\n pubdate = None\n authors = []\n for x in [x.strip() for x in row.xpath('descendant::*[contains(@class, \"pev_contributor\")]/@title')]:\n authors.extend(a.strip() for a in x.split(','))\n entry = {\n 'sku': sku,\n 'cover': row.xpath('descendant::img/@src')[0].split('?')[0],\n 'publisher': astext(row.xpath('descendant::*[contains(@class, \"headerPublisher\")]')[0]),\n 'title': astext(row.xpath('descendant::*[@id=\"title_{}\"]'.format(sku))[0]),\n 'authors': authors,\n 'isbns': isbns,\n 'tags': tags,\n 'pubdate': pubdate,\n 'format': ' '.join(row.xpath('descendant::*[contains(@class, \"pev_format\")]/text()')).strip(),\n 'rating': rating,\n }\n if entry['cover'].startswith('/'):\n entry['cover'] = None\n yield entry\n\n\nclass Edelweiss(Source):\n\n name = 'Edelweiss'\n version = (2, 0, 1)\n minimum_calibre_version = (3, 6, 0)\n description = _('Downloads metadata and covers from Edelweiss - A catalog updated by book publishers')\n\n capabilities = frozenset(['identify', 'cover'])\n touched_fields = frozenset([\n 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',\n 'identifier:isbn', 'identifier:edelweiss', 'rating'])\n supports_gzip_transfer_encoding = True\n has_html_comments = True\n\n @property\n def user_agent(self):\n # Pass in an index to random_user_agent() to test with a particular\n # user agent\n return random_user_agent(allow_ie=False)\n\n def _get_book_url(self, sku):\n if sku:\n return 'https://www.edelweiss.plus/#sku={}&page=1'.format(sku)\n\n def get_book_url(self, identifiers): # {{{\n sku = identifiers.get('edelweiss', None)\n if sku:\n return 'edelweiss', sku, self._get_book_url(sku)\n\n # }}}\n\n def get_cached_cover_url(self, identifiers): # {{{\n sku = identifiers.get('edelweiss', None)\n if not sku:\n isbn = identifiers.get('isbn', None)\n if isbn is not None:\n sku = self.cached_isbn_to_identifier(isbn)\n return self.cached_identifier_to_cover_url(sku)\n # }}}\n\n def create_query(self, log, title=None, authors=None, identifiers={}):\n try:\n from urllib.parse import urlencode\n except ImportError:\n from urllib import urlencode\n import time\n BASE_URL = ('https://www.edelweiss.plus/GetTreelineControl.aspx?'\n 'controlName=/uc/listviews/controls/ListView_data.ascx&itemID=0&resultType=32&dashboardType=8&itemType=1&dataType=products&keywordSearch&')\n keywords = []\n isbn = check_isbn(identifiers.get('isbn', None))\n if isbn is not None:\n keywords.append(isbn)\n elif title:\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n keywords.extend(title_tokens)\n author_tokens = self.get_author_tokens(authors, only_first_author=True)\n if author_tokens:\n keywords.extend(author_tokens)\n if not keywords:\n return None\n params = {\n 'q': (' '.join(keywords)).encode('utf-8'),\n '_': type('')(int(time.time()))\n }\n return BASE_URL+urlencode(params)\n\n # }}}\n\n def identify(self, log, result_queue, abort, title=None, authors=None, # {{{\n identifiers={}, timeout=30):\n import json\n\n br = self.browser\n br.addheaders = [\n ('Referer', 'https://www.edelweiss.plus/'),\n ('X-Requested-With', 'XMLHttpRequest'),\n ('Cache-Control', 'no-cache'),\n ('Pragma', 'no-cache'),\n ]\n if 'edelweiss' in identifiers:\n items = [identifiers['edelweiss']]\n else:\n log.error('Currently Edelweiss returns random books for search queries')\n return\n query = self.create_query(log, title=title, authors=authors,\n identifiers=identifiers)\n if not query:\n log.error('Insufficient metadata to construct query')\n return\n log('Using query URL:', query)\n try:\n raw = br.open(query, timeout=timeout).read().decode('utf-8')\n except Exception as e:\n log.exception('Failed to make identify query: %r'%query)\n return as_unicode(e)\n items = re.search(r'window[.]items\\s*=\\s*(.+?);', raw)\n if items is None:\n log.error('Failed to get list of matching items')\n log.debug('Response text:')\n log.debug(raw)\n return\n items = json.loads(items.group(1))\n\n if (not items and identifiers and title and authors and\n not abort.is_set()):\n return self.identify(log, result_queue, abort, title=title,\n authors=authors, timeout=timeout)\n\n if not items:\n return\n\n workers = []\n items = items[:5]\n for i, item in enumerate(get_basic_data(self.browser, log, *items)):\n sku = item['sku']\n for isbn in item['isbns']:\n self.cache_isbn_to_identifier(isbn, sku)\n if item['cover']:\n self.cache_identifier_to_cover_url(sku, item['cover'])\n fmt = item['format'].lower()\n if 'audio' in fmt or 'mp3' in fmt:\n continue # Audio-book, ignore\n workers.append(Worker(item, i, result_queue, br.clone_browser(), timeout, log, self))\n\n if not workers:\n return\n\n for w in workers:\n w.start()\n # Don't send all requests at the same time\n time.sleep(0.1)\n\n while not abort.is_set():\n a_worker_is_alive = False\n for w in workers:\n w.join(0.2)\n if abort.is_set():\n break\n if w.is_alive():\n a_worker_is_alive = True\n if not a_worker_is_alive:\n break\n\n # }}}\n\n def download_cover(self, log, result_queue, abort, # {{{\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n cached_url = self.get_cached_cover_url(identifiers)\n if cached_url is None:\n log.info('No cached cover found, running identify')\n rq = Queue()\n self.identify(log, rq, abort, title=title, authors=authors,\n identifiers=identifiers)\n if abort.is_set():\n return\n results = []\n while True:\n try:\n results.append(rq.get_nowait())\n except Empty:\n break\n results.sort(key=self.identify_results_keygen(\n title=title, authors=authors, identifiers=identifiers))\n for mi in results:\n cached_url = self.get_cached_cover_url(mi.identifiers)\n if cached_url is not None:\n break\n if cached_url is None:\n log.info('No cover found')\n return\n\n if abort.is_set():\n return\n br = self.browser\n log('Downloading cover from:', cached_url)\n try:\n cdata = br.open_novisit(cached_url, timeout=timeout).read()\n result_queue.put((self, cdata))\n except:\n log.exception('Failed to download cover from:', cached_url)\n # }}}\n\n\nif __name__ == '__main__':\n from calibre.ebooks.metadata.sources.test import authors_test, comments_test, pubdate_test, test_identify_plugin, title_test\n tests = [\n ( # A title and author search\n {'title': 'The Husband\\'s Secret', 'authors':['Liane Moriarty']},\n [title_test('The Husband\\'s Secret', exact=True),\n authors_test(['Liane Moriarty'])]\n ),\n\n ( # An isbn present in edelweiss\n {'identifiers':{'isbn': '9780312621360'}, },\n [title_test('Flame: A Sky Chasers Novel', exact=True),\n authors_test(['Amy Kathleen Ryan'])]\n ),\n\n # Multiple authors and two part title and no general description\n ({'identifiers':{'edelweiss':'0321180607'}},\n [title_test(\n \"XQuery From the Experts: A Guide to the W3C XML Query Language\"\n , exact=True), authors_test([\n 'Howard Katz', 'Don Chamberlin', 'Denise Draper', 'Mary Fernandez',\n 'Michael Kay', 'Jonathan Robie', 'Michael Rys', 'Jerome Simeon',\n 'Jim Tivy', 'Philip Wadler']), pubdate_test(2003, 8, 22),\n comments_test('Jérôme Siméon'), lambda mi: bool(mi.comments and 'No title summary' not in mi.comments)\n ]),\n ]\n start, stop = 0, len(tests)\n\n tests = tests[start:stop]\n test_identify_plugin(Edelweiss.name, tests)\n",
|
||
"google": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport hashlib\nimport os\nimport re\nimport sys\nimport tempfile\nimport time\n\nimport regex\n\ntry:\n from queue import Empty, Queue\nexcept ImportError:\n from Queue import Empty, Queue\n\nfrom calibre import as_unicode, prepare_string_for_xml, replace_entities\nfrom calibre.ebooks.chardet import xml_to_unicode\nfrom calibre.ebooks.metadata import authors_to_string, check_isbn\nfrom calibre.ebooks.metadata.book.base import Metadata\nfrom calibre.ebooks.metadata.sources.base import Source\nfrom calibre.utils.cleantext import clean_ascii_chars\nfrom calibre.utils.localization import canonicalize_lang\n\nNAMESPACES = {\n 'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',\n 'atom': 'http://www.w3.org/2005/Atom',\n 'dc': 'http://purl.org/dc/terms',\n 'gd': 'http://schemas.google.com/g/2005'\n}\n\n\ndef pretty_google_books_comments(raw):\n raw = replace_entities(raw)\n # Paragraphs in the comments are removed but whatever software googl uses\n # to do this does not insert a space so we often find the pattern\n # word.Capital in the comments which can be used to find paragraph markers.\n parts = []\n for x in re.split(r'([a-z)\"”])(\\.)([A-Z(\"“])', raw):\n if x == '.':\n parts.append('.</p>\\n\\n<p>')\n else:\n parts.append(prepare_string_for_xml(x))\n raw = '<p>' + ''.join(parts) + '</p>'\n return raw\n\n\ndef get_details(browser, url, timeout): # {{{\n try:\n raw = browser.open_novisit(url, timeout=timeout).read()\n except Exception as e:\n gc = getattr(e, 'getcode', lambda: -1)\n if gc() != 403:\n raise\n # Google is throttling us, wait a little\n time.sleep(2)\n raw = browser.open_novisit(url, timeout=timeout).read()\n\n return raw\n\n\n# }}}\n\nxpath_cache = {}\n\n\ndef XPath(x):\n ans = xpath_cache.get(x)\n if ans is None:\n from lxml import etree\n ans = xpath_cache[x] = etree.XPath(x, namespaces=NAMESPACES)\n return ans\n\n\ndef to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{\n from lxml import etree\n\n # total_results = XPath('//openSearch:totalResults')\n # start_index = XPath('//openSearch:startIndex')\n # items_per_page = XPath('//openSearch:itemsPerPage')\n entry = XPath('//atom:entry')\n entry_id = XPath('descendant::atom:id')\n url = XPath('descendant::atom:link[@rel=\"self\"]/@href')\n creator = XPath('descendant::dc:creator')\n identifier = XPath('descendant::dc:identifier')\n title = XPath('descendant::dc:title')\n date = XPath('descendant::dc:date')\n publisher = XPath('descendant::dc:publisher')\n subject = XPath('descendant::dc:subject')\n description = XPath('descendant::dc:description')\n language = XPath('descendant::dc:language')\n\n # print(etree.tostring(entry_, pretty_print=True))\n\n def get_text(extra, x):\n try:\n ans = x(extra)\n if ans:\n ans = ans[0].text\n if ans and ans.strip():\n return ans.strip()\n except:\n log.exception('Programming error:')\n return None\n\n def get_extra_details():\n raw = get_details(browser, details_url, timeout)\n if running_a_test:\n with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:\n f.write(raw)\n print('Book details saved to:', f.name, file=sys.stderr)\n feed = etree.fromstring(\n xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n )\n return entry(feed)[0]\n\n if isinstance(entry_, str):\n google_id = entry_\n details_url = 'https://www.google.com/books/feeds/volumes/' + google_id\n extra = get_extra_details()\n title_ = ': '.join([x.text for x in title(extra)]).strip()\n authors = [x.text.strip() for x in creator(extra) if x.text]\n else:\n id_url = entry_id(entry_)[0].text\n google_id = id_url.split('/')[-1]\n details_url = url(entry_)[0]\n title_ = ': '.join([x.text for x in title(entry_)]).strip()\n authors = [x.text.strip() for x in creator(entry_) if x.text]\n if not id_url or not title:\n # Silently discard this entry\n return None\n extra = None\n\n if not authors:\n authors = [_('Unknown')]\n if not title:\n return None\n if extra is None:\n extra = get_extra_details()\n mi = Metadata(title_, authors)\n mi.identifiers = {'google': google_id}\n mi.comments = get_text(extra, description)\n lang = canonicalize_lang(get_text(extra, language))\n if lang:\n mi.language = lang\n mi.publisher = get_text(extra, publisher)\n\n # ISBN\n isbns = []\n for x in identifier(extra):\n t = type('')(x.text).strip()\n if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):\n if t[:5].upper() == 'ISBN:':\n t = check_isbn(t[5:])\n if t:\n isbns.append(t)\n if isbns:\n mi.isbn = sorted(isbns, key=len)[-1]\n mi.all_isbns = isbns\n\n # Tags\n try:\n btags = [x.text for x in subject(extra) if x.text]\n tags = []\n for t in btags:\n atags = [y.strip() for y in t.split('/')]\n for tag in atags:\n if tag not in tags:\n tags.append(tag)\n except:\n log.exception('Failed to parse tags:')\n tags = []\n if tags:\n mi.tags = [x.replace(',', ';') for x in tags]\n\n # pubdate\n pubdate = get_text(extra, date)\n if pubdate:\n from calibre.utils.date import parse_date, utcnow\n try:\n default = utcnow().replace(day=15)\n mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)\n except:\n log.error('Failed to parse pubdate %r' % pubdate)\n\n # Cover\n mi.has_google_cover = None\n for x in extra.xpath(\n '//*[@href and @rel=\"http://schemas.google.com/books/2008/thumbnail\"]'\n ):\n mi.has_google_cover = x.get('href')\n break\n\n return mi\n\n\n# }}}\n\n\nclass GoogleBooks(Source):\n\n name = 'Google'\n version = (1, 1, 1)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads metadata and covers from Google Books')\n\n capabilities = frozenset({'identify'})\n touched_fields = frozenset({\n 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',\n 'identifier:isbn', 'identifier:google', 'languages'\n })\n supports_gzip_transfer_encoding = True\n cached_cover_url_is_reliable = False\n\n GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'\n\n DUMMY_IMAGE_MD5 = frozenset(\n ('0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f')\n )\n\n def get_book_url(self, identifiers): # {{{\n goog = identifiers.get('google', None)\n if goog is not None:\n return ('google', goog, 'https://books.google.com/books?id=%s' % goog)\n # }}}\n\n def id_from_url(self, url): # {{{\n from polyglot.urllib import parse_qs, urlparse\n purl = urlparse(url)\n if purl.netloc == 'books.google.com':\n q = parse_qs(purl.query)\n gid = q.get('id')\n if gid:\n return 'google', gid[0]\n # }}}\n\n def create_query(self, title=None, authors=None, identifiers={}, capitalize_isbn=False): # {{{\n try:\n from urllib.parse import urlencode\n except ImportError:\n from urllib import urlencode\n BASE_URL = 'https://books.google.com/books/feeds/volumes?'\n isbn = check_isbn(identifiers.get('isbn', None))\n q = ''\n if isbn is not None:\n q += ('ISBN:' if capitalize_isbn else 'isbn:') + isbn\n elif title or authors:\n\n def build_term(prefix, parts):\n return ' '.join('in' + prefix + ':' + x for x in parts)\n\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n q += build_term('title', title_tokens)\n author_tokens = list(self.get_author_tokens(authors, only_first_author=True))\n if author_tokens:\n q += ('+' if q else '') + build_term('author', author_tokens)\n\n if not q:\n return None\n if not isinstance(q, bytes):\n q = q.encode('utf-8')\n return BASE_URL + urlencode({\n 'q': q,\n 'max-results': 20,\n 'start-index': 1,\n 'min-viewability': 'none',\n })\n\n # }}}\n\n def download_cover( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30,\n get_best_cover=False\n ):\n cached_url = self.get_cached_cover_url(identifiers)\n if cached_url is None:\n log.info('No cached cover found, running identify')\n rq = Queue()\n self.identify(\n log,\n rq,\n abort,\n title=title,\n authors=authors,\n identifiers=identifiers\n )\n if abort.is_set():\n return\n results = []\n while True:\n try:\n results.append(rq.get_nowait())\n except Empty:\n break\n results.sort(\n key=self.identify_results_keygen(\n title=title, authors=authors, identifiers=identifiers\n )\n )\n for mi in results:\n cached_url = self.get_cached_cover_url(mi.identifiers)\n if cached_url is not None:\n break\n if cached_url is None:\n log.info('No cover found')\n return\n\n br = self.browser\n for candidate in (0, 1):\n if abort.is_set():\n return\n url = cached_url + '&zoom={}'.format(candidate)\n log('Downloading cover from:', cached_url)\n try:\n cdata = br.open_novisit(url, timeout=timeout).read()\n if cdata:\n if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5:\n log.warning('Google returned a dummy image, ignoring')\n else:\n result_queue.put((self, cdata))\n break\n except Exception:\n log.exception('Failed to download cover from:', cached_url)\n\n # }}}\n\n def get_cached_cover_url(self, identifiers): # {{{\n url = None\n goog = identifiers.get('google', None)\n if goog is None:\n isbn = identifiers.get('isbn', None)\n if isbn is not None:\n goog = self.cached_isbn_to_identifier(isbn)\n if goog is not None:\n url = self.cached_identifier_to_cover_url(goog)\n\n return url\n\n # }}}\n\n def postprocess_downloaded_google_metadata(self, ans, relevance=0): # {{{\n if not isinstance(ans, Metadata):\n return ans\n ans.source_relevance = relevance\n goog = ans.identifiers['google']\n for isbn in getattr(ans, 'all_isbns', []):\n self.cache_isbn_to_identifier(isbn, goog)\n if getattr(ans, 'has_google_cover', False):\n self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog)\n if ans.comments:\n ans.comments = pretty_google_books_comments(ans.comments)\n self.clean_downloaded_metadata(ans)\n return ans\n # }}}\n\n def get_all_details( # {{{\n self,\n br,\n log,\n entries,\n abort,\n result_queue,\n timeout\n ):\n from lxml import etree\n for relevance, i in enumerate(entries):\n try:\n ans = self.postprocess_downloaded_google_metadata(to_metadata(br, log, i, timeout, self.running_a_test), relevance)\n if isinstance(ans, Metadata):\n result_queue.put(ans)\n except Exception:\n log.exception(\n 'Failed to get metadata for identify entry:', etree.tostring(i)\n )\n if abort.is_set():\n break\n\n # }}}\n\n def identify_via_web_search( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30\n ):\n from calibre.utils.filenames import ascii_text\n isbn = check_isbn(identifiers.get('isbn', None))\n q = []\n strip_punc_pat = regex.compile(r'[\\p{C}|\\p{M}|\\p{P}|\\p{S}|\\p{Z}]+', regex.UNICODE)\n google_ids = []\n check_tokens = set()\n has_google_id = 'google' in identifiers\n\n def to_check_tokens(*tokens):\n for t in tokens:\n if len(t) < 3:\n continue\n t = t.lower()\n if t in ('and', 'not', 'the'):\n continue\n yield ascii_text(strip_punc_pat.sub('', t))\n\n if has_google_id:\n google_ids.append(identifiers['google'])\n elif isbn is not None:\n q.append(isbn)\n elif title or authors:\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n q += title_tokens\n check_tokens |= set(to_check_tokens(*title_tokens))\n author_tokens = list(self.get_author_tokens(authors, only_first_author=True))\n if author_tokens:\n q += author_tokens\n check_tokens |= set(to_check_tokens(*author_tokens))\n if not q and not google_ids:\n return None\n from calibre.ebooks.metadata.sources.update import search_engines_module\n se = search_engines_module()\n br = se.google_specialize_browser(se.browser())\n if not has_google_id:\n url = se.google_format_query(q, tbm='bks')\n log('Making query:', url)\n r = []\n root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)\n pat = re.compile(r'id=([^&]+)')\n for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):\n m = pat.search(q.url)\n if m is None or not q.url.startswith('https://books.google'):\n continue\n google_ids.append(m.group(1))\n\n if not google_ids and isbn and (title or authors):\n return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)\n found = False\n seen = set()\n for relevance, gid in enumerate(google_ids):\n if gid in seen:\n continue\n seen.add(gid)\n try:\n ans = to_metadata(br, log, gid, timeout, self.running_a_test)\n if isinstance(ans, Metadata):\n if isbn:\n if isbn not in ans.all_isbns:\n log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn,\n 'not in', ' '.join(ans.all_isbns))\n continue\n elif check_tokens:\n candidate = set(to_check_tokens(*self.get_title_tokens(ans.title)))\n candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors)))\n if candidate.intersection(check_tokens) != check_tokens:\n log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')\n continue\n ans = self.postprocess_downloaded_google_metadata(ans, relevance)\n result_queue.put(ans)\n found = True\n except:\n log.exception('Failed to get metadata for google books id:', gid)\n if abort.is_set():\n break\n if not found and isbn and (title or authors):\n return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)\n # }}}\n\n def identify( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30\n ):\n from lxml import etree\n entry = XPath('//atom:entry')\n identifiers = identifiers.copy()\n br = self.browser\n if 'google' in identifiers:\n try:\n ans = to_metadata(br, log, identifiers['google'], timeout, self.running_a_test)\n if isinstance(ans, Metadata):\n self.postprocess_downloaded_google_metadata(ans)\n result_queue.put(ans)\n return\n except Exception:\n log.exception('Failed to get metadata for Google identifier:', identifiers['google'])\n del identifiers['google']\n\n query = self.create_query(\n title=title, authors=authors, identifiers=identifiers\n )\n if not query:\n log.error('Insufficient metadata to construct query')\n return\n\n def make_query(query):\n log('Making query:', query)\n try:\n raw = br.open_novisit(query, timeout=timeout).read()\n except Exception as e:\n log.exception('Failed to make identify query: %r' % query)\n return False, as_unicode(e)\n\n try:\n feed = etree.fromstring(\n xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n )\n return True, entry(feed)\n except Exception as e:\n log.exception('Failed to parse identify results')\n return False, as_unicode(e)\n ok, entries = make_query(query)\n if not ok:\n return entries\n if not entries and not abort.is_set():\n log('No results found, doing a web search instead')\n return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)\n\n # There is no point running these queries in threads as google\n # throttles requests returning 403 Forbidden errors\n self.get_all_details(br, log, entries, abort, result_queue, timeout)\n\n # }}}\n\n\nif __name__ == '__main__': # tests {{{\n # To run these test use:\n # calibre-debug src/calibre/ebooks/metadata/sources/google.py\n from calibre.ebooks.metadata.sources.test import authors_test, test_identify_plugin, title_test\n tests = [\n ({\n 'identifiers': {'google': 's7NIrgEACAAJ'},\n }, [title_test('Ride Every Stride', exact=False)]),\n\n ({\n 'identifiers': {'isbn': '0743273567'},\n 'title': 'Great Gatsby',\n 'authors': ['Fitzgerald']\n }, [\n title_test('The great gatsby', exact=True),\n authors_test(['F. Scott Fitzgerald'])\n ]),\n\n ({\n 'title': 'Flatland',\n 'authors': ['Abbott']\n }, [title_test('Flatland', exact=False)]),\n\n ({\n 'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',\n 'authors': ['David Handler'],\n }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')\n ]),\n\n ({\n # requires using web search to find the book\n 'title': 'Dragon Done It',\n 'authors': ['Eric Flint'],\n }, [\n title_test('The dragon done it', exact=True),\n authors_test(['Eric Flint', 'Mike Resnick'])\n ]),\n\n ]\n test_identify_plugin(GoogleBooks.name, tests[:])\n\n# }}}\n",
|
||
"google_images": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom collections import OrderedDict\n\nfrom calibre import random_user_agent\nfrom calibre.ebooks.metadata.sources.base import Option, Source\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef imgurl_from_id(raw, tbnid):\n from json import JSONDecoder\n q = '\"{}\",['.format(tbnid)\n start_pos = raw.index(q)\n if start_pos < 100:\n return\n jd = JSONDecoder()\n data = jd.raw_decode('[' + raw[start_pos:])[0]\n # from pprint import pprint\n # pprint(data)\n url_num = 0\n for x in data:\n if isinstance(x, list) and len(x) == 3:\n q = x[0]\n if hasattr(q, 'lower') and q.lower().startswith('http'):\n url_num += 1\n if url_num > 1:\n return q\n\n\ndef parse_google_markup(raw):\n root = parse_html(raw)\n # newer markup pages use data-docid not data-tbnid\n results = root.xpath('//div/@data-tbnid') or root.xpath('//div/@data-docid')\n ans = OrderedDict()\n for tbnid in results:\n try:\n imgurl = imgurl_from_id(raw, tbnid)\n except Exception:\n continue\n if imgurl:\n ans[imgurl] = True\n return list(ans)\n\n\n\nclass GoogleImages(Source):\n\n name = 'Google Images'\n version = (1, 0, 6)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.')\n capabilities = frozenset(['cover'])\n can_get_multiple_covers = True\n supports_gzip_transfer_encoding = True\n options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),\n _('The maximum number of covers to process from the Google search result')),\n Option('size', 'choices', 'svga', _('Cover size'),\n _('Search for covers larger than the specified size'),\n choices=OrderedDict((\n ('any', _('Any size'),),\n ('l', _('Large'),),\n ('qsvga', _('Larger than %s')%'400x300',),\n ('vga', _('Larger than %s')%'640x480',),\n ('svga', _('Larger than %s')%'600x800',),\n ('xga', _('Larger than %s')%'1024x768',),\n ('2mp', _('Larger than %s')%'2 MP',),\n ('4mp', _('Larger than %s')%'4 MP',),\n ))),\n )\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if not title:\n return\n timeout = max(60, timeout) # Needs at least a minute\n title = ' '.join(self.get_title_tokens(title))\n author = ' '.join(self.get_author_tokens(authors))\n urls = self.get_image_urls(title, author, log, abort, timeout)\n self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)\n\n @property\n def user_agent(self):\n return random_user_agent(allow_ie=False)\n\n def get_image_urls(self, title, author, log, abort, timeout):\n from calibre.utils.cleantext import clean_ascii_chars\n try:\n from urllib.parse import urlencode\n except ImportError:\n from urllib import urlencode\n br = self.browser\n q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')})\n if isinstance(q, bytes):\n q = q.decode('utf-8')\n sz = self.prefs['size']\n if sz == 'any':\n sz = ''\n elif sz == 'l':\n sz = 'isz:l,'\n else:\n sz = 'isz:lt,islt:%s,' % sz\n # See https://www.google.com/advanced_image_search to understand this\n # URL scheme\n url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)\n log('Search URL: ' + url)\n # See https://github.com/benbusby/whoogle-search/pull/1054 for cookies\n br.set_simple_cookie('CONSENT', 'PENDING+987', '.google.com', path='/')\n template = b'\\x08\\x01\\x128\\x08\\x14\\x12+boq_identityfrontenduiserver_20231107.05_p0\\x1a\\x05en-US \\x03\\x1a\\x06\\x08\\x80\\xf1\\xca\\xaa\\x06'\n from base64 import standard_b64encode\n from datetime import date\n template.replace(b'20231107', date.today().strftime('%Y%m%d').encode('ascii'))\n br.set_simple_cookie('SOCS', standard_b64encode(template).decode('ascii').rstrip('='), '.google.com', path='/')\n # br.set_debug_http(True)\n raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))\n # with open('/t/raw.html', 'w') as f:\n # f.write(raw)\n return parse_google_markup(raw)\n\n\ndef test_raw():\n import sys\n raw = open(sys.argv[-1]).read()\n for x in parse_google_markup(raw):\n print(x)\n\n\ndef test(title='Star Trek: Section 31: Control', authors=('David Mack',)):\n try:\n from queue import Queue\n except ImportError:\n from Queue import Queue\n from threading import Event\n\n from calibre.utils.logging import default_log\n p = GoogleImages(None)\n p.log = default_log\n rq = Queue()\n p.download_cover(default_log, rq, Event(), title=title, authors=authors)\n print('Downloaded', rq.qsize(), 'covers')\n\n\nif __name__ == '__main__':\n test()\n",
|
||
"hashes": {
|
||
"amazon": "1a9880d3ea36306d6ff5bfb46c605c6e6e81c02a",
|
||
"big_book_search": "7a8b67c0f19ecbfe8a9d28b961aab1119f31c3e3",
|
||
"edelweiss": "a8ec3d6919265c52d896d4688e366302495cc525",
|
||
"google": "dd793082f1ba0aba3157197487aae8f147cf05cf",
|
||
"google_images": "173685f099e904071f8d566709a446240e26a968",
|
||
"openlibrary": "8707d3b3161de476b46ed967dea1116707dcfb0a",
|
||
"search_engines": "3a3c25fc0bcd7955078caf4c9c0e1a1a1dd6478e"
|
||
},
|
||
"openlibrary": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom calibre.ebooks.metadata.sources.base import Source\n\n\nclass OpenLibrary(Source):\n\n name = 'Open Library'\n version = (1, 0, 0)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads covers from The Open Library')\n\n capabilities = frozenset(['cover'])\n\n OPENLIBRARY = 'https://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'\n\n def download_cover(self, log, result_queue, abort,\n title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n if 'isbn' not in identifiers:\n return\n isbn = identifiers['isbn']\n br = self.browser\n try:\n ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()\n result_queue.put((self, ans))\n except Exception as e:\n if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:\n log.error('No cover for ISBN: %r found'%isbn)\n else:\n log.exception('Failed to download cover for ISBN:', isbn)\n",
|
||
"search_engines": "#!/usr/bin/env python\n# vim:fileencoding=utf-8\n# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport json\nimport os\nimport re\nimport sys\nimport time\nfrom collections import namedtuple\nfrom contextlib import contextmanager\nfrom functools import partial\nfrom threading import Lock\n\ntry:\n from urllib.parse import parse_qs, quote, quote_plus, urlencode, urlparse\nexcept ImportError:\n from urllib import quote, quote_plus, urlencode\n\n from urlparse import parse_qs, urlparse\n\nfrom lxml import etree\n\nfrom calibre import browser as _browser\nfrom calibre import prints as safe_print\nfrom calibre import random_user_agent\nfrom calibre.constants import cache_dir\nfrom calibre.ebooks.chardet import xml_to_unicode\nfrom calibre.utils.lock import ExclusiveFile\nfrom calibre.utils.random_ua import accept_header_for_ua\n\ncurrent_version = (1, 2, 9)\nminimum_calibre_version = (2, 80, 0)\nwebcache = {}\nwebcache_lock = Lock()\nprints = partial(safe_print, file=sys.stderr)\n\n\nResult = namedtuple('Result', 'url title cached_url')\n\n\n@contextmanager\ndef rate_limit(name='test', time_between_visits=2, max_wait_seconds=5 * 60, sleep_time=0.2):\n lock_file = os.path.join(cache_dir(), 'search-engine.' + name + '.lock')\n with ExclusiveFile(lock_file, timeout=max_wait_seconds, sleep_time=sleep_time) as f:\n try:\n lv = float(f.read().decode('utf-8').strip())\n except Exception:\n lv = 0\n # we cannot use monotonic() as this is cross process and historical\n # data as well\n delta = time.time() - lv\n if delta < time_between_visits:\n time.sleep(time_between_visits - delta)\n try:\n yield\n finally:\n f.seek(0)\n f.truncate()\n f.write(repr(time.time()).encode('utf-8'))\n\n\ndef tostring(elem):\n return etree.tostring(elem, encoding='unicode', method='text', with_tail=False)\n\n\ndef browser():\n ua = random_user_agent(allow_ie=False)\n # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'\n br = _browser(user_agent=ua)\n br.set_handle_gzip(True)\n br.addheaders += [\n ('Accept', accept_header_for_ua(ua)),\n ('Upgrade-insecure-requests', '1'),\n ]\n return br\n\n\ndef encode_query(**query):\n q = {k.encode('utf-8'): v.encode('utf-8') for k, v in query.items()}\n return urlencode(q).decode('utf-8')\n\n\ndef parse_html(raw):\n try:\n from html5_parser import parse\n except ImportError:\n # Old versions of calibre\n import html5lib\n return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n else:\n return parse(raw)\n\n\ndef query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None, simple_scraper=None):\n with rate_limit(key):\n if simple_scraper is None:\n raw = br.open_novisit(url, timeout=timeout).read()\n raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]\n else:\n raw = simple_scraper(url, timeout=timeout)\n if dump_raw is not None:\n with open(dump_raw, 'w') as f:\n f.write(raw)\n if save_raw is not None:\n save_raw(raw)\n return parser(raw)\n\n\ndef quote_term(x):\n ans = quote_plus(x.encode('utf-8'))\n if isinstance(ans, bytes):\n ans = ans.decode('utf-8')\n return ans\n\n\n# DDG + Wayback machine {{{\n\n\ndef ddg_url_processor(url):\n return url\n\n\ndef ddg_term(t):\n t = t.replace('\"', '')\n if t.lower() in {'map', 'news'}:\n t = '\"' + t + '\"'\n if t in {'OR', 'AND', 'NOT'}:\n t = t.lower()\n return t\n\n\ndef ddg_href(url):\n if url.startswith('/'):\n q = url.partition('?')[2]\n url = parse_qs(q.encode('utf-8'))['uddg'][0].decode('utf-8')\n return url\n\n\ndef wayback_machine_cached_url(url, br=None, log=prints, timeout=60):\n q = quote_term(url)\n br = br or browser()\n data = query(br, 'https://archive.org/wayback/available?url=' +\n q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)\n try:\n closest = data['archived_snapshots']['closest']\n if closest['available']:\n ans = closest['url'].replace('http:', 'https:', 1)\n # get unmodified HTML\n ans = ans.replace(closest['timestamp'], closest['timestamp'] + 'id_', 1)\n return ans\n except Exception:\n pass\n from pprint import pformat\n log('Response from wayback machine:', pformat(data))\n\n\ndef wayback_url_processor(url):\n if url.startswith('/'):\n # Use original URL instead of absolutizing to wayback URL as wayback is\n # slow\n m = re.search('https?:', url)\n if m is None:\n url = 'https://web.archive.org' + url\n else:\n url = url[m.start():]\n return url\n\n\nddg_scraper_storage = []\n\n\ndef ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):\n # https://duck.co/help/results/syntax\n terms = [quote_term(ddg_term(t)) for t in terms]\n if site is not None:\n terms.append(quote_term(('site:' + site)))\n q = '+'.join(terms)\n url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(\n q=q, kp=1 if safe_search else -1)\n log('Making ddg query: ' + url)\n from calibre.scraper.simple import read_url\n br = br or browser()\n root = query(br, url, 'ddg', dump_raw, timeout=timeout, simple_scraper=partial(read_url, ddg_scraper_storage))\n ans = []\n for a in root.xpath('//*[@class=\"results\"]//*[@class=\"result__title\"]/a[@href and @class=\"result__a\"]'):\n try:\n ans.append(Result(ddg_href(a.get('href')), tostring(a), None))\n except KeyError:\n log('Failed to find ddg href in:', a.get('href'))\n return ans, url\n\n\ndef ddg_develop():\n br = browser()\n for result in ddg_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:\n if '/dp/' in result.url:\n print(result.title)\n print(' ', result.url)\n print(' ', get_cached_url(result.url, br))\n print()\n# }}}\n\n# Bing {{{\n\n\ndef bing_term(t):\n t = t.replace('\"', '')\n if t in {'OR', 'AND', 'NOT'}:\n t = t.lower()\n return t\n\n\ndef bing_url_processor(url):\n return url\n\n\ndef bing_cached_url(url, br=None, log=prints, timeout=60):\n results, search_url = bing_search(['url:' + url], br=br, log=log, timeout=timeout)\n for result in results:\n return result.cached_url\n\n\ndef resolve_bing_wrapper_page(url, br, log):\n raw = br.open_novisit(url).read().decode('utf-8', 'replace')\n m = re.search(r'var u = \"(.+)\"', raw)\n if m is None:\n log('Failed to resolve bing wrapper page for url: ' + url)\n return url\n log('Resolved bing wrapped URL: ' + url + ' to ' + m.group(1))\n return m.group(1)\n\n\ndef bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60, show_user_agent=False):\n # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm\n terms = [quote_term(bing_term(t)) for t in terms]\n if site is not None:\n terms.append(quote_term(('site:' + site)))\n q = '+'.join(terms)\n url = 'https://www.bing.com/search?q={q}'.format(q=q)\n log('Making bing query: ' + url)\n if br is None:\n br = browser()\n else:\n br = br.clone_browser()\n br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']\n ua = ''\n from calibre.utils.random_ua import random_common_chrome_user_agent\n while not ua:\n ua = random_common_chrome_user_agent()\n if show_user_agent:\n print('User-agent:', ua)\n br.addheaders.append(('User-agent', ua))\n\n root = query(br, url, 'bing', dump_raw, timeout=timeout)\n ans = []\n for li in root.xpath('//*[@id=\"b_results\"]/li[@class=\"b_algo\"]'):\n a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class=\"b_algoheader\"]/a[@href]')\n a = a[0]\n title = tostring(a)\n try:\n div = li.xpath('descendant::div[@class=\"b_attribution\" and @u]')[0]\n except IndexError:\n log('Ignoring {!r} as it has no cached page'.format(title))\n continue\n d, w = div.get('u').split('|')[-2:]\n cached_url = 'https://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(\n q=q, d=d, w=w)\n ans_url = a.get('href')\n if ans_url.startswith('https://www.bing.com/'):\n ans_url = resolve_bing_wrapper_page(ans_url, br, log)\n ans.append(Result(ans_url, title, cached_url))\n if not ans:\n title = ' '.join(root.xpath('//title/text()'))\n log('Failed to find any results on results page, with title:', title)\n return ans, url\n\n\ndef bing_develop(terms='heroes abercrombie'):\n if isinstance(terms, str):\n terms = terms.split()\n for result in bing_search(terms, 'www.amazon.com', dump_raw='/t/raw.html', show_user_agent=True)[0]:\n if '/dp/' in result.url:\n print(result.title)\n print(' ', result.url)\n print(' ', result.cached_url)\n print()\n# }}}\n\n# Google {{{\n\n\ndef google_term(t):\n t = t.replace('\"', '')\n if t in {'OR', 'AND', 'NOT'}:\n t = t.lower()\n return t\n\n\ndef google_url_processor(url):\n return url\n\n\ndef google_cache_url_for_url(url):\n if not isinstance(url, bytes):\n url = url.encode('utf-8')\n cu = quote(url, safe='')\n if isinstance(cu, bytes):\n cu = cu.decode('utf-8')\n return 'https://webcache.googleusercontent.com/search?q=cache:' + cu\n\n\ndef google_get_cached_url(url, br=None, log=prints, timeout=60):\n # Google's webcache was discontinued in september 2024\n cached_url = google_cache_url_for_url(url)\n br = google_specialize_browser(br or browser())\n try:\n raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)\n except Exception as err:\n log('Failed to get cached URL from google for URL: {} with error: {}'.format(url, err))\n else:\n with webcache_lock:\n webcache[cached_url] = raw\n return cached_url\n\n\ndef canonicalize_url_for_cache_map(url):\n try:\n purl = urlparse(url)\n except Exception:\n return url\n if '.amazon.' in purl.netloc:\n url = url.split('&', 1)[0]\n return url\n\n\ndef google_parse_results(root, raw, log=prints, ignore_uncached=True):\n ans = []\n seen = set()\n for div in root.xpath('//*[@id=\"search\"]//*[@id=\"rso\"]//div[descendant::h3]'):\n try:\n a = div.xpath('descendant::a[@href]')[0]\n except IndexError:\n log('Ignoring div with no main result link')\n continue\n title = tostring(a)\n src_url = a.get('href')\n # print(f'{src_url=}')\n curl = canonicalize_url_for_cache_map(src_url)\n if curl in seen:\n continue\n seen.add(curl)\n ans.append(Result(curl, title, None))\n if not ans:\n title = ' '.join(root.xpath('//title/text()'))\n log('Failed to find any results on results page, with title:', title)\n return ans\n\n\ndef google_consent_cookies():\n # See https://github.com/benbusby/whoogle-search/pull/1054 for cookies\n from base64 import standard_b64encode\n from datetime import date\n base = {'domain': '.google.com', 'path': '/'}\n b = base.copy()\n b['name'], b['value'] = 'CONSENT', 'PENDING+987'\n yield b\n template = b'\\x08\\x01\\x128\\x08\\x14\\x12+boq_identityfrontenduiserver_20231107.05_p0\\x1a\\x05en-US \\x03\\x1a\\x06\\x08\\x80\\xf1\\xca\\xaa\\x06'\n template.replace(b'20231107', date.today().strftime('%Y%m%d').encode('ascii'))\n b = base.copy()\n b['name'], b['value'] = 'SOCS', standard_b64encode(template).decode('ascii').rstrip('=')\n yield b\n\n\ndef google_specialize_browser(br):\n with webcache_lock:\n if not hasattr(br, 'google_consent_cookie_added'):\n for c in google_consent_cookies():\n br.set_simple_cookie(c['name'], c['value'], c['domain'], path=c['path'])\n br.google_consent_cookie_added = True\n return br\n\n\ndef is_probably_book_asin(t):\n return t and len(t) == 10 and t.startswith('B') and t.upper() == t\n\n\ndef is_asin_or_isbn(t):\n from calibre.ebooks.metadata import check_isbn\n return bool(check_isbn(t) or is_probably_book_asin(t))\n\n\ndef google_format_query(terms, site=None, tbm=None):\n prevent_spelling_correction = False\n for t in terms:\n if is_asin_or_isbn(t):\n prevent_spelling_correction = True\n break\n terms = [quote_term(google_term(t)) for t in terms]\n if site is not None:\n terms.append(quote_term(('site:' + site)))\n q = '+'.join(terms)\n url = 'https://www.google.com/search?q={q}'.format(q=q)\n if tbm:\n url += '&tbm=' + tbm\n if prevent_spelling_correction:\n url += '&nfpr=1'\n return url\n\n\ndef google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):\n url = google_format_query(terms, site)\n log('Making google query: ' + url)\n br = google_specialize_browser(br or browser())\n r = []\n root = query(br, url, 'google', dump_raw, timeout=timeout, save_raw=r.append)\n return google_parse_results(root, r[0], log=log), url\n\n\ndef google_develop(search_terms='1423146786', raw_from=''):\n if raw_from:\n with open(raw_from, 'rb') as f:\n raw = f.read()\n results = google_parse_results(parse_html(raw), raw)\n else:\n br = browser()\n results = google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]\n for result in results:\n if '/dp/' in result.url:\n print(result.title)\n print(' ', result.url)\n print(' ', result.cached_url)\n print()\n# }}}\n\n\ndef get_cached_url(url, br=None, log=prints, timeout=60):\n return bing_cached_url(url, br, log, timeout) or wayback_machine_cached_url(url, br, log, timeout)\n\n\ndef get_data_for_cached_url(url):\n with webcache_lock:\n return webcache.get(url)\n\n\ndef resolve_url(url):\n prefix, rest = url.partition(':')[::2]\n if prefix == 'bing':\n return bing_url_processor(rest)\n if prefix == 'wayback':\n return wayback_url_processor(rest)\n return url\n\n\n# if __name__ == '__main__':\n# import sys\n# func = sys.argv[-1]\n# globals()[func]()\n"
|
||
} |