summaryrefslogtreecommitdiff
path: root/searx/engines/microsoft_academic.py
blob: 14de4ac9a020f1bf3d20f841a6362acf2909d7c8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 Microsoft Academic (Science)
"""

from datetime import datetime
from json import loads
from uuid import uuid4
from urllib.parse import urlencode
from searx.utils import html_to_text

# Engine metadata: where the service lives and how this engine talks to it
# (unofficial JSON endpoint, no API key).
about = {
    "website": 'https://academic.microsoft.com',
    "wikidata_id": 'Q28136779',
    "official_api_documentation": 'http://ma-graph.org/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'JSON',
}

# response() emits plain url/title/content entries for scholarly works, so the
# engine belongs to the 'science' category (matching the module docstring),
# not 'images'.
categories = ['science']
# request() reads params['pageno'] to select the result page.
paging = True
# Search endpoint; request() fills {query} with the url-encoded correlationId.
result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}'


def request(query, params):
    """Populate *params* with the POST request for the entity search endpoint.

    Two fresh UUIDs are generated per call: one is sent as the
    ``correlationId`` URL parameter, the other as the ``msacademic`` cookie.
    The ``ai_user`` cookie carries the current local timestamp.  The search
    terms are wrapped in ``@`` characters and placed, together with the
    paging fields, in ``params['data']``.

    *params* must already contain a ``cookies`` mapping and a ``pageno``
    entry.  The same (mutated) dict is returned.
    """
    corr_id = uuid4()
    cookie_id = uuid4()
    now = datetime.now()

    query_string = urlencode({'correlationId': corr_id})
    params['url'] = result_url.format(query=query_string)

    cookies = params['cookies']
    cookies['msacademic'] = str(cookie_id)
    cookies['ai_user'] = 'vhd0H|{now}'.format(now=str(now))

    params['method'] = 'POST'
    # NOTE(review): Offset is sent as the zero-based page index, not as
    # (pageno - 1) * Limit -- confirm the endpoint expects a page index
    # rather than an item offset.
    params['data'] = dict(
        Query='@{query}@'.format(query=query),
        Limit=10,
        Offset=params['pageno'] - 1,
        Filters='',
        OrderBy='',
        SortAscending=False,
    )

    return params


def response(resp):
    """Convert the JSON body of *resp* into a list of result dicts.

    Each entry of the payload's ``results`` list becomes a dict with the keys
    ``url``, ``title`` and ``content``; title and content are passed through
    ``html_to_text``.  An empty/falsy payload yields an empty list.
    """
    parsed = []
    payload = loads(resp.text)
    if not payload:
        return parsed

    for entry in payload['results']:
        link = _get_url(entry)
        heading = entry['e']['dn']
        snippet = _get_content(entry)
        parsed.append(dict(
            url=link,
            title=html_to_text(heading),
            content=html_to_text(snippet),
        ))

    return parsed


def _get_url(result):
    if 's' in result['e']:
        return result['e']['s'][0]['u']
    return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id'])


def _get_content(result):
    if 'd' in result['e']:
        content = result['e']['d']
        if len(content) > 300:
            return content[:300] + '...'
        return content

    return ''