{
"amazon": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal
')\n else:\n parts.append(prepare_string_for_xml(x))\n raw = '
' + ''.join(parts) + '
'\n return raw\n\n\ndef get_details(browser, url, timeout): # {{{\n try:\n raw = browser.open_novisit(url, timeout=timeout).read()\n except Exception as e:\n gc = getattr(e, 'getcode', lambda: -1)\n if gc() != 403:\n raise\n # Google is throttling us, wait a little\n time.sleep(2)\n raw = browser.open_novisit(url, timeout=timeout).read()\n\n return raw\n\n\n# }}}\n\nxpath_cache = {}\n\n\ndef XPath(x):\n ans = xpath_cache.get(x)\n if ans is None:\n from lxml import etree\n ans = xpath_cache[x] = etree.XPath(x, namespaces=NAMESPACES)\n return ans\n\n\ndef to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{\n from lxml import etree\n\n # total_results = XPath('//openSearch:totalResults')\n # start_index = XPath('//openSearch:startIndex')\n # items_per_page = XPath('//openSearch:itemsPerPage')\n entry = XPath('//atom:entry')\n entry_id = XPath('descendant::atom:id')\n url = XPath('descendant::atom:link[@rel=\"self\"]/@href')\n creator = XPath('descendant::dc:creator')\n identifier = XPath('descendant::dc:identifier')\n title = XPath('descendant::dc:title')\n date = XPath('descendant::dc:date')\n publisher = XPath('descendant::dc:publisher')\n subject = XPath('descendant::dc:subject')\n description = XPath('descendant::dc:description')\n language = XPath('descendant::dc:language')\n\n # print(etree.tostring(entry_, pretty_print=True))\n\n def get_text(extra, x):\n try:\n ans = x(extra)\n if ans:\n ans = ans[0].text\n if ans and ans.strip():\n return ans.strip()\n except:\n log.exception('Programming error:')\n return None\n\n def get_extra_details():\n raw = get_details(browser, details_url, timeout)\n if running_a_test:\n with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:\n f.write(raw)\n print('Book details saved to:', f.name, file=sys.stderr)\n feed = etree.fromstring(\n xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n )\n return entry(feed)[0]\n\n if isinstance(entry_, str):\n google_id = entry_\n details_url = 'https://www.google.com/books/feeds/volumes/' + google_id\n extra = get_extra_details()\n title_ = ': '.join([x.text for x in title(extra)]).strip()\n authors = [x.text.strip() for x in creator(extra) if x.text]\n else:\n id_url = entry_id(entry_)[0].text\n google_id = id_url.split('/')[-1]\n details_url = url(entry_)[0]\n title_ = ': '.join([x.text for x in title(entry_)]).strip()\n authors = [x.text.strip() for x in creator(entry_) if x.text]\n if not id_url or not title:\n # Silently discard this entry\n return None\n extra = None\n\n if not authors:\n authors = [_('Unknown')]\n if not title:\n return None\n if extra is None:\n extra = get_extra_details()\n mi = Metadata(title_, authors)\n mi.identifiers = {'google': google_id}\n mi.comments = get_text(extra, description)\n lang = canonicalize_lang(get_text(extra, language))\n if lang:\n mi.language = lang\n mi.publisher = get_text(extra, publisher)\n\n # ISBN\n isbns = []\n for x in identifier(extra):\n t = type('')(x.text).strip()\n if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):\n if t[:5].upper() == 'ISBN:':\n t = check_isbn(t[5:])\n if t:\n isbns.append(t)\n if isbns:\n mi.isbn = sorted(isbns, key=len)[-1]\n mi.all_isbns = isbns\n\n # Tags\n try:\n btags = [x.text for x in subject(extra) if x.text]\n tags = []\n for t in btags:\n atags = [y.strip() for y in t.split('/')]\n for tag in atags:\n if tag not in tags:\n tags.append(tag)\n except:\n log.exception('Failed to parse tags:')\n tags = []\n if tags:\n mi.tags = [x.replace(',', ';') for x in tags]\n\n # pubdate\n pubdate = get_text(extra, date)\n if pubdate:\n from calibre.utils.date import parse_date, utcnow\n try:\n default = utcnow().replace(day=15)\n mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)\n except:\n log.error('Failed to parse pubdate %r' % pubdate)\n\n # Cover\n mi.has_google_cover = None\n for x in extra.xpath(\n '//*[@href and @rel=\"http://schemas.google.com/books/2008/thumbnail\"]'\n ):\n mi.has_google_cover = x.get('href')\n break\n\n return mi\n\n\n# }}}\n\n\nclass GoogleBooks(Source):\n\n name = 'Google'\n version = (1, 1, 0)\n minimum_calibre_version = (2, 80, 0)\n description = _('Downloads metadata and covers from Google Books')\n\n capabilities = frozenset({'identify'})\n touched_fields = frozenset({\n 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',\n 'identifier:isbn', 'identifier:google', 'languages'\n })\n supports_gzip_transfer_encoding = True\n cached_cover_url_is_reliable = False\n\n GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'\n\n DUMMY_IMAGE_MD5 = frozenset(\n ('0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f')\n )\n\n def get_book_url(self, identifiers): # {{{\n goog = identifiers.get('google', None)\n if goog is not None:\n return ('google', goog, 'https://books.google.com/books?id=%s' % goog)\n # }}}\n\n def id_from_url(self, url): # {{{\n from polyglot.urllib import parse_qs, urlparse\n purl = urlparse(url)\n if purl.netloc == 'books.google.com':\n q = parse_qs(purl.query)\n gid = q.get('id')\n if gid:\n return 'google', gid[0]\n # }}}\n\n def create_query(self, title=None, authors=None, identifiers={}, capitalize_isbn=False): # {{{\n try:\n from urllib.parse import urlencode\n except ImportError:\n from urllib import urlencode\n BASE_URL = 'https://books.google.com/books/feeds/volumes?'\n isbn = check_isbn(identifiers.get('isbn', None))\n q = ''\n if isbn is not None:\n q += ('ISBN:' if capitalize_isbn else 'isbn:') + isbn\n elif title or authors:\n\n def build_term(prefix, parts):\n return ' '.join('in' + prefix + ':' + x for x in parts)\n\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n q += build_term('title', title_tokens)\n author_tokens = list(self.get_author_tokens(authors, only_first_author=True))\n if author_tokens:\n q += ('+' if q else '') + build_term('author', author_tokens)\n\n if not q:\n return None\n if not isinstance(q, bytes):\n q = q.encode('utf-8')\n return BASE_URL + urlencode({\n 'q': q,\n 'max-results': 20,\n 'start-index': 1,\n 'min-viewability': 'none',\n })\n\n # }}}\n\n def download_cover( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30,\n get_best_cover=False\n ):\n cached_url = self.get_cached_cover_url(identifiers)\n if cached_url is None:\n log.info('No cached cover found, running identify')\n rq = Queue()\n self.identify(\n log,\n rq,\n abort,\n title=title,\n authors=authors,\n identifiers=identifiers\n )\n if abort.is_set():\n return\n results = []\n while True:\n try:\n results.append(rq.get_nowait())\n except Empty:\n break\n results.sort(\n key=self.identify_results_keygen(\n title=title, authors=authors, identifiers=identifiers\n )\n )\n for mi in results:\n cached_url = self.get_cached_cover_url(mi.identifiers)\n if cached_url is not None:\n break\n if cached_url is None:\n log.info('No cover found')\n return\n\n br = self.browser\n for candidate in (0, 1):\n if abort.is_set():\n return\n url = cached_url + '&zoom={}'.format(candidate)\n log('Downloading cover from:', cached_url)\n try:\n cdata = br.open_novisit(url, timeout=timeout).read()\n if cdata:\n if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5:\n log.warning('Google returned a dummy image, ignoring')\n else:\n result_queue.put((self, cdata))\n break\n except Exception:\n log.exception('Failed to download cover from:', cached_url)\n\n # }}}\n\n def get_cached_cover_url(self, identifiers): # {{{\n url = None\n goog = identifiers.get('google', None)\n if goog is None:\n isbn = identifiers.get('isbn', None)\n if isbn is not None:\n goog = self.cached_isbn_to_identifier(isbn)\n if goog is not None:\n url = self.cached_identifier_to_cover_url(goog)\n\n return url\n\n # }}}\n\n def postprocess_downloaded_google_metadata(self, ans, relevance=0): # {{{\n if not isinstance(ans, Metadata):\n return ans\n ans.source_relevance = relevance\n goog = ans.identifiers['google']\n for isbn in getattr(ans, 'all_isbns', []):\n self.cache_isbn_to_identifier(isbn, goog)\n if getattr(ans, 'has_google_cover', False):\n self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog)\n if ans.comments:\n ans.comments = pretty_google_books_comments(ans.comments)\n self.clean_downloaded_metadata(ans)\n return ans\n # }}}\n\n def get_all_details( # {{{\n self,\n br,\n log,\n entries,\n abort,\n result_queue,\n timeout\n ):\n from lxml import etree\n for relevance, i in enumerate(entries):\n try:\n ans = self.postprocess_downloaded_google_metadata(to_metadata(br, log, i, timeout, self.running_a_test), relevance)\n if isinstance(ans, Metadata):\n result_queue.put(ans)\n except Exception:\n log.exception(\n 'Failed to get metadata for identify entry:', etree.tostring(i)\n )\n if abort.is_set():\n break\n\n # }}}\n\n def identify_via_web_search( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30\n ):\n isbn = check_isbn(identifiers.get('isbn', None))\n q = []\n strip_punc_pat = regex.compile(r'[\\p{C}|\\p{M}|\\p{P}|\\p{S}|\\p{Z}]+', regex.UNICODE)\n google_ids = []\n check_tokens = set()\n has_google_id = 'google' in identifiers\n\n def to_check_tokens(*tokens):\n for t in tokens:\n if len(t) < 3:\n continue\n t = t.lower()\n if t in ('and', 'not', 'the'):\n continue\n yield strip_punc_pat.sub('', t)\n\n if has_google_id:\n google_ids.append(identifiers['google'])\n elif isbn is not None:\n q.append(isbn)\n elif title or authors:\n title_tokens = list(self.get_title_tokens(title))\n if title_tokens:\n q += title_tokens\n check_tokens |= set(to_check_tokens(*title_tokens))\n author_tokens = list(self.get_author_tokens(authors, only_first_author=True))\n if author_tokens:\n q += author_tokens\n check_tokens |= set(to_check_tokens(*author_tokens))\n if not q and not google_ids:\n return None\n from calibre.ebooks.metadata.sources.update import search_engines_module\n se = search_engines_module()\n br = se.google_specialize_browser(se.browser())\n if not has_google_id:\n url = se.google_format_query(q, tbm='bks')\n log('Making query:', url)\n r = []\n root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)\n pat = re.compile(r'id=([^&]+)')\n for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):\n m = pat.search(q.url)\n if m is None or not q.url.startswith('https://books.google'):\n continue\n google_ids.append(m.group(1))\n\n if not google_ids and isbn and (title or authors):\n return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)\n found = False\n seen = set()\n for relevance, gid in enumerate(google_ids):\n if gid in seen:\n continue\n seen.add(gid)\n try:\n ans = to_metadata(br, log, gid, timeout, self.running_a_test)\n if isinstance(ans, Metadata):\n if isbn:\n if isbn not in ans.all_isbns:\n log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn,\n 'not in', ' '.join(ans.all_isbns))\n continue\n elif check_tokens:\n candidate = set(to_check_tokens(*self.get_title_tokens(ans.title)))\n candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors)))\n if candidate.intersection(check_tokens) != check_tokens:\n log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')\n continue\n ans = self.postprocess_downloaded_google_metadata(ans, relevance)\n result_queue.put(ans)\n found = True\n except:\n log.exception('Failed to get metadata for google books id:', gid)\n if abort.is_set():\n break\n if not found and isbn and (title or authors):\n return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)\n # }}}\n\n def identify( # {{{\n self,\n log,\n result_queue,\n abort,\n title=None,\n authors=None,\n identifiers={},\n timeout=30\n ):\n from lxml import etree\n entry = XPath('//atom:entry')\n identifiers = identifiers.copy()\n br = self.browser\n if 'google' in identifiers:\n try:\n ans = to_metadata(br, log, identifiers['google'], timeout, self.running_a_test)\n if isinstance(ans, Metadata):\n self.postprocess_downloaded_google_metadata(ans)\n result_queue.put(ans)\n return\n except Exception:\n log.exception('Failed to get metadata for Google identifier:', identifiers['google'])\n del identifiers['google']\n\n query = self.create_query(\n title=title, authors=authors, identifiers=identifiers\n )\n if not query:\n log.error('Insufficient metadata to construct query')\n return\n\n def make_query(query):\n log('Making query:', query)\n try:\n raw = br.open_novisit(query, timeout=timeout).read()\n except Exception as e:\n log.exception('Failed to make identify query: %r' % query)\n return False, as_unicode(e)\n\n try:\n feed = etree.fromstring(\n xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n )\n return True, entry(feed)\n except Exception as e:\n log.exception('Failed to parse identify results')\n return False, as_unicode(e)\n ok, entries = make_query(query)\n if not ok:\n return entries\n if not entries and not abort.is_set():\n log('No results found, doing a web search instead')\n return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)\n\n # There is no point running these queries in threads as google\n # throttles requests returning 403 Forbidden errors\n self.get_all_details(br, log, entries, abort, result_queue, timeout)\n\n # }}}\n\n\nif __name__ == '__main__': # tests {{{\n # To run these test use:\n # calibre-debug src/calibre/ebooks/metadata/sources/google.py\n from calibre.ebooks.metadata.sources.test import (\n authors_test, test_identify_plugin, title_test\n )\n tests = [\n ({\n 'identifiers': {'google': 's7NIrgEACAAJ'},\n }, [title_test('Ride Every Stride', exact=False)]),\n\n ({\n 'identifiers': {'isbn': '0743273567'},\n 'title': 'Great Gatsby',\n 'authors': ['Fitzgerald']\n }, [\n title_test('The great gatsby', exact=True),\n authors_test(['F. Scott Fitzgerald'])\n ]),\n\n ({\n 'title': 'Flatland',\n 'authors': ['Abbott']\n }, [title_test('Flatland', exact=False)]),\n\n ({\n 'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',\n 'authors': ['David Handler'],\n }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')\n ]),\n\n ({\n # requires using web search to find the book\n 'title': 'Dragon Done It',\n 'authors': ['Eric Flint'],\n }, [\n title_test('The dragon done it', exact=True),\n authors_test(['Eric Flint', 'Mike Resnick'])\n ]),\n\n ]\n test_identify_plugin(GoogleBooks.name, tests[:])\n\n# }}}\n", "google_images": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__ = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal