Science category: update the engines

* use the paper.html template
* fetch more data from the engines
* add crossref.py
author: Alexandre FLAMENT <alexandre.flament@hesge.ch> 2022-08-26 16:10:12 +0000
committer: Alexandre Flament <alex@al-f.net> 2022-09-23 20:45:58 +0200
commit: e36f85b8365e5d6a9263dd78242a10a305a9000c (patch)
tree: fce8f3e33d26847b004c20c378fb3fa35ca2c8eb /searx/engines/google_scholar.py
parent: 593026ad9cd024fd7b3182d48f274aa41b374c74 (diff)
download: searxng-e36f85b8365e5d6a9263dd78242a10a305a9000c.tar.gz
searxng-e36f85b8365e5d6a9263dd78242a10a305a9000c.zip
1 files changed, 73 insertions, 12 deletions
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py
index 41c62886b..c07cd4cea 100644
--- a/searx/engines/google_scholar.py
+++ b/searx/engines/google_scholar.py
@@ -13,10 +13,12 @@ Definitions`_.
 
 from urllib.parse import urlencode
 from datetime import datetime
+from typing import Optional
 from lxml import html
 
 from searx.utils import (
     eval_xpath,
+    eval_xpath_getindex,
     eval_xpath_list,
     extract_text,
 )
@@ -46,7 +48,7 @@ about = {
 }
 
 # engine dependent config
-categories = ['science']
+categories = ['science', 'scientific publications']
 paging = True
 language_support = True
 use_locale_domain = True
@@ -99,7 +101,43 @@ def request(query, params):
     return params
 
 
-def response(resp):
+def parse_gs_a(text: Optional[str]):
+    """Parse the text written in green into (authors, journal, publisher, publishedDate).
+
+    Possible formats (parts that are absent or cannot be parsed are returned as None):
+    * "{authors} - {journal}, {year} - {publisher}"
+    * "{authors} - {year} - {publisher}"
+    * "{authors} - {publisher}"
+    """
+    if text is None or text == "":
+        return None, None, None, None
+
+    s_text = text.split(' - ')
+    authors = s_text[0].split(', ')
+    publisher = s_text[-1]
+    if len(s_text) != 3:
+        return authors, None, publisher, None
+
+    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
+    # split the middle part into the optional journal and the year
+    journal_year = s_text[1].split(', ')
+    # the journal is optional and may itself contain commas
+    if len(journal_year) > 1:
+        journal = ', '.join(journal_year[0:-1])
+        if journal == '…':
+            journal = None
+    else:
+        journal = None
+    # the year is the last comma-separated item; if it does not parse as '%Y' there is no date
+    year = journal_year[-1]
+    try:
+        publishedDate = datetime.strptime(year.strip(), '%Y')
+    except ValueError:
+        publishedDate = None
+    return authors, journal, publisher, publishedDate
+
+
+def response(resp):  # pylint: disable=too-many-locals
     """Get response from google's search request"""
     results = []
 
@@ -112,30 +150,53 @@ def response(resp):
     dom = html.fromstring(resp.text)
 
     # parse results
-    for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'):
+    for result in eval_xpath_list(dom, '//div[@data-cid]'):
 
-        title = extract_text(eval_xpath(result, './h3[1]//a'))
+        title = extract_text(eval_xpath(result, './/h3[1]//a'))
 
         if not title:
             # this is a [ZITATION] block
             continue
 
-        url = eval_xpath(result, './h3[1]//a/@href')[0]
-        content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or ''
-
-        pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]'))
-        if pub_info:
-            content += "[%s]" % pub_info
-
         pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
         if pub_type:
-            title = title + " " + pub_type
+            pub_type = pub_type[1:-1].lower()
+
+        url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
+        content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
+        authors, journal, publisher, publishedDate = parse_gs_a(
+            extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
+        )
+        if publisher in url:
+            publisher = None
+
+        # cited by
+        comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
+
+        # link to the html or pdf document
+        html_url = None
+        pdf_url = None
+        doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
+        doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
+        if doc_type == "[PDF]":
+            pdf_url = doc_url
+        else:
+            html_url = doc_url
 
         results.append(
             {
+                'template': 'paper.html',
+                'type': pub_type,
                 'url': url,
                 'title': title,
+                'authors': authors,
+                'publisher': publisher,
+                'journal': journal,
+                'publishedDate': publishedDate,
                 'content': content,
+                'comments': comments,
+                'html_url': html_url,
+                'pdf_url': pdf_url,
             }
         )
author	Alexandre FLAMENT <alexandre.flament@hesge.ch>	2022-08-26 16:10:12 +0000
committer	Alexandre Flament <alex@al-f.net>	2022-09-23 20:45:58 +0200
commit	e36f85b8365e5d6a9263dd78242a10a305a9000c (patch)
tree	fce8f3e33d26847b004c20c378fb3fa35ca2c8eb /searx/engines/google_scholar.py
parent	593026ad9cd024fd7b3182d48f274aa41b374c74 (diff)
download	searxng-e36f85b8365e5d6a9263dd78242a10a305a9000c.tar.gz searxng-e36f85b8365e5d6a9263dd78242a10a305a9000c.zip