searx/engines/torznab.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Torznab_ is an API specification that provides a standardized way to query
torrent site for content. It is used by a number of torrent applications,
including Prowlarr_ and Jackett_.

Using this engine together with Prowlarr_ or Jackett_ allows you to search
a huge number of torrent sites which are not directly supported.

Configuration
=============

The engine has the following settings:

``base_url``:
  Torznab endpoint URL.

``api_key``:
  The API key to use for authentication.

``torznab_categories``:
  The categories to use for searching. This is a list of category IDs.  See
  Prowlarr-categories_ or Jackett-categories_ for more information.

``show_torrent_files``:
  Whether to show the torrent file in the search results.  Be careful as using
  this with Prowlarr_ or Jackett_ leaks the API key.  This should be used only
  if you are querying a Torznab endpoint without authentication or if the
  instance is private.  Be aware that private trackers may ban you if you share
  the torrent file.  Defaults to ``false``.

``show_magnet_links``:
  Whether to show the magnet link in the search results.  Be aware that private
  trackers may ban you if you share the magnet link.  Defaults to ``true``.

.. _Torznab:
   https://torznab.github.io/spec-1.3-draft/index.html
.. _Prowlarr:
   https://github.com/Prowlarr/Prowlarr
.. _Jackett:
   https://github.com/Jackett/Jackett
.. _Prowlarr-categories:
   https://wiki.servarr.com/en/prowlarr/cardigann-yml-definition#categories
.. _Jackett-categories:
   https://github.com/Jackett/Jackett/wiki/Jackett-Categories

Implementations
===============

"""
from __future__ import annotations
from typing import TYPE_CHECKING

from typing import List, Dict, Any
from datetime import datetime
from urllib.parse import quote
from lxml import etree  # type: ignore

from searx.exceptions import SearxEngineAPIException
from searx.utils import humanize_bytes

if TYPE_CHECKING:
    import httpx
    import logging

    logger: logging.Logger

# engine settings
about: Dict[str, Any] = {
    "website": None,
    "wikidata_id": None,
    "official_api_documentation": "https://torznab.github.io/spec-1.3-draft",
    "use_official_api": True,
    "require_api_key": False,
    "results": 'XML',
}
categories: List[str] = ['files']
paging: bool = False
time_range_support: bool = False

# defined in settings.yml
# example (Jackett): "http://localhost:9117/api/v2.0/indexers/all/results/torznab"
base_url: str = ''
api_key: str = ''
# https://newznab.readthedocs.io/en/latest/misc/api/#predefined-categories
torznab_categories: List[str] = []
show_torrent_files: bool = False
show_magnet_links: bool = True


def init(engine_settings=None):  # pylint: disable=unused-argument
    """Initialize the engine."""
    if len(base_url) < 1:
        raise ValueError('missing torznab base_url')


def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]:
    """Build the request params."""
    search_url: str = base_url + '?t=search&q={search_query}'

    if len(api_key) > 0:
        search_url += '&apikey={api_key}'
    if len(torznab_categories) > 0:
        search_url += '&cat={torznab_categories}'

    params['url'] = search_url.format(
        search_query=quote(query), api_key=api_key, torznab_categories=",".join([str(x) for x in torznab_categories])
    )

    return params


def response(resp: httpx.Response) -> List[Dict[str, Any]]:
    """Parse the XML response and return a list of results."""
    results = []
    search_results = etree.XML(resp.content)

    # handle errors:  https://newznab.readthedocs.io/en/latest/misc/api/#newznab-error-codes
    if search_results.tag == "error":
        raise SearxEngineAPIException(search_results.get("description"))

    channel: etree.Element = search_results[0]

    item: etree.Element
    for item in channel.iterfind('item'):
        result: Dict[str, Any] = build_result(item)
        results.append(result)

    return results


def build_result(item: etree.Element) -> Dict[str, Any]:
    """Build a result from a XML item."""

    # extract attributes from XML
    # see https://torznab.github.io/spec-1.3-draft/torznab/Specification-v1.3.html#predefined-attributes
    enclosure: etree.Element | None = item.find('enclosure')
    enclosure_url: str | None = None
    if enclosure is not None:
        enclosure_url = enclosure.get('url')

    filesize = get_attribute(item, 'size')
    if not filesize and enclosure:
        filesize = enclosure.get('length')

    guid = get_attribute(item, 'guid')
    comments = get_attribute(item, 'comments')
    pubDate = get_attribute(item, 'pubDate')
    seeders = get_torznab_attribute(item, 'seeders')
    leechers = get_torznab_attribute(item, 'leechers')
    peers = get_torznab_attribute(item, 'peers')

    # map attributes to searx result
    result: Dict[str, Any] = {
        'template': 'torrent.html',
        'title': get_attribute(item, 'title'),
        'filesize': humanize_bytes(int(filesize)) if filesize else None,
        'files': get_attribute(item, 'files'),
        'seed': seeders,
        'leech': _map_leechers(leechers, seeders, peers),
        'url': _map_result_url(guid, comments),
        'publishedDate': _map_published_date(pubDate),
        'torrentfile': None,
        'magnetlink': None,
    }

    link = get_attribute(item, 'link')
    if show_torrent_files:
        result['torrentfile'] = _map_torrent_file(link, enclosure_url)
    if show_magnet_links:
        magneturl = get_torznab_attribute(item, 'magneturl')
        result['magnetlink'] = _map_magnet_link(magneturl, guid, enclosure_url, link)
    return result


def _map_result_url(guid: str | None, comments: str | None) -> str | None:
    if guid and guid.startswith('http'):
        return guid
    if comments and comments.startswith('http'):
        return comments
    return None


def _map_leechers(leechers: str | None, seeders: str | None, peers: str | None) -> str | None:
    if leechers:
        return leechers
    if seeders and peers:
        return str(int(peers) - int(seeders))
    return None


def _map_published_date(pubDate: str | None) -> datetime | None:
    if pubDate is not None:
        try:
            return datetime.strptime(pubDate, '%a, %d %b %Y %H:%M:%S %z')
        except (ValueError, TypeError) as e:
            logger.debug("ignore exception (publishedDate): %s", e)
    return None


def _map_torrent_file(link: str | None, enclosure_url: str | None) -> str | None:
    if link and link.startswith('http'):
        return link
    if enclosure_url and enclosure_url.startswith('http'):
        return enclosure_url
    return None


def _map_magnet_link(
    magneturl: str | None,
    guid: str | None,
    enclosure_url: str | None,
    link: str | None,
) -> str | None:
    if magneturl and magneturl.startswith('magnet'):
        return magneturl
    if guid and guid.startswith('magnet'):
        return guid
    if enclosure_url and enclosure_url.startswith('magnet'):
        return enclosure_url
    if link and link.startswith('magnet'):
        return link
    return None


def get_attribute(item: etree.Element, property_name: str) -> str | None:
    """Get attribute from item."""
    property_element: etree.Element | None = item.find(property_name)
    if property_element is not None:
        return property_element.text
    return None


def get_torznab_attribute(item: etree.Element, attribute_name: str) -> str | None:
    """Get torznab special attribute from item."""
    element: etree.Element | None = item.find(
        './/torznab:attr[@name="{attribute_name}"]'.format(attribute_name=attribute_name),
        {'torznab': 'http://torznab.com/schemas/2015/feed'},
    )
    if element is not None:
        return element.get("value")
    return None