summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marc Abonce Seguin <marc-abonce@mailbox.org> 2020-07-26 23:27:16 -0700
committer: Marc Abonce Seguin <marc-abonce@mailbox.org> 2020-07-26 23:53:40 -0700
commit: 77b9faa8dfa1a856a63c91169adba3c710a394b5 (patch)
tree: 51d4a15b00c9535893202adb6675e730aa1aa4a8
parent: 6d18769ccf8e0e97479f80dc34547e8c28c61daa (diff)
download: searxng-77b9faa8dfa1a856a63c91169adba3c710a394b5.tar.gz
searxng-77b9faa8dfa1a856a63c91169adba3c710a394b5.zip
fix Wikipedia's paragraph extraction
-rw-r--r-- searx/engines/wikipedia.py 28
1 files changed, 1 insertions, 27 deletions
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index a216ba886..7c9378dd8 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -49,29 +49,6 @@ def request(query, params):
return params
-# get first meaningful paragraph
-# this should filter out disambiguation pages and notes above first paragraph
-# "magic numbers" were obtained by fine tuning
-def extract_first_paragraph(content, title, image):
- first_paragraph = None
-
- failed_attempts = 0
- for paragraph in content.split('\n'):
-
- starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
- length = len(paragraph)
-
- if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
- first_paragraph = paragraph
- break
-
- failed_attempts += 1
- if failed_attempts > 3:
- return None
-
- return first_paragraph
-
-
# get response from search-request
def response(resp):
results = []
@@ -97,10 +74,7 @@ def response(resp):
if image:
image = image.get('source')
- extract = page.get('extract')
-
- summary = extract_first_paragraph(extract, title, image)
- summary = summary.replace('() ', '')
+ summary = page.get('extract', '').split('\n')[0].replace('()', '')
# link to wikipedia article
wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \