diff options
author | Marc Abonce Seguin <marc-abonce@mailbox.org> | 2020-07-26 23:27:16 -0700 |
---|---|---|
committer | Marc Abonce Seguin <marc-abonce@mailbox.org> | 2020-07-26 23:53:40 -0700 |
commit | 77b9faa8dfa1a856a63c91169adba3c710a394b5 (patch) | |
tree | 51d4a15b00c9535893202adb6675e730aa1aa4a8 /searx/engines/wikipedia.py | |
parent | 6d18769ccf8e0e97479f80dc34547e8c28c61daa (diff) | |
download | searxng-77b9faa8dfa1a856a63c91169adba3c710a394b5.tar.gz searxng-77b9faa8dfa1a856a63c91169adba3c710a394b5.zip |
fix Wikipedia's paragraph extraction
Diffstat (limited to 'searx/engines/wikipedia.py')
-rw-r--r-- | searx/engines/wikipedia.py | 28 |
1 files changed, 1 insertions, 27 deletions
```diff
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index a216ba886..7c9378dd8 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -49,29 +49,6 @@ def request(query, params):
     return params
 
 
-# get first meaningful paragraph
-# this should filter out disambiguation pages and notes above first paragraph
-# "magic numbers" were obtained by fine tuning
-def extract_first_paragraph(content, title, image):
-    first_paragraph = None
-
-    failed_attempts = 0
-    for paragraph in content.split('\n'):
-
-        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
-        length = len(paragraph)
-
-        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
-            first_paragraph = paragraph
-            break
-
-        failed_attempts += 1
-        if failed_attempts > 3:
-            return None
-
-    return first_paragraph
-
-
 # get response from search-request
 def response(resp):
     results = []
@@ -97,10 +74,7 @@ def response(resp):
     if image:
         image = image.get('source')
 
-    extract = page.get('extract')
-
-    summary = extract_first_paragraph(extract, title, image)
-    summary = summary.replace('() ', '')
+    summary = page.get('extract', '').split('\n')[0].replace('()', '')
 
     # link to wikipedia article
     wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
```