[feat] engine: add adobe stock video and audio engines

The engine has been revised; there is now the option ``adobe_content_types`` with which it is possible to configure engines for video and audio from the adobe stock. BTW this patch adds documentation to the engine. To test all three engines in one use a search term like:: !asi !asv !asa sound Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
author: Markus Heiser <markus.heiser@darmarit.de> 2024-11-22 19:25:23 +0100
committer: Bnyro <bnyro@tutanota.com> 2024-11-24 11:56:12 +0100
commit: 0253c10b522fe2e8206db78e37066771ce90cf92 (patch)
tree: 43bae14fc4b48397e5ba7c0fee27f6dadd9aa155
parent: f20a7632f1421eaea094b630a677f68e7a0808c7 (diff)
download: searxng-0253c10b522fe2e8206db78e37066771ce90cf92.tar.gz
searxng-0253c10b522fe2e8206db78e37066771ce90cf92.zip
4 files changed, 237 insertions, 38 deletions
diff --git a/docs/dev/engines/online/adobe_stock.rst b/docs/dev/engines/online/adobe_stock.rst
new file mode 100644
index 000000000..48a6511c0
--- /dev/null
+++ b/docs/dev/engines/online/adobe_stock.rst
@@ -0,0 +1,13 @@
+.. _adobe stock engine:
+
+===========
+Adobe Stock
+===========
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.adobe_stock
+   :members:
diff --git a/requirements.txt b/requirements.txt
index 5225565e6..03babb7a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ tomli==2.0.2; python_version < '3.11'
 msgspec==0.18.6
 eval_type_backport; python_version < '3.9'
 typer-slim==0.13.1
+isodate==0.7.2
diff --git a/searx/engines/adobe_stock.py b/searx/engines/adobe_stock.py
index f1b8e13ef..e6664d273 100644
--- a/searx/engines/adobe_stock.py
+++ b/searx/engines/adobe_stock.py
@@ -1,67 +1,229 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""Adobe Stock (images)
+"""`Adobe Stock`_ is a service that gives access to millions of royalty-free
+assets. Asset types include photos, vectors, illustrations, templates, 3D
+assets, videos, motion graphics templates and audio tracks.
+
+.. _Adobe Stock: https://stock.adobe.com/
+
+Configuration
+=============
+
+The engine has the following mandatory settings:
+
+- SearXNG's :ref:`engine categories`
+- Adobe-Stock's :py:obj:`adobe_order`
+- Adobe-Stock's :py:obj:`adobe_content_types`
+
+.. code:: yaml
+
+  - name: adobe stock
+    engine: adobe_stock
+    shortcut: asi
+    categories: [images]
+    adobe_order: relevance
+    adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
+
+  - name: adobe stock video
+    engine: adobe_stock
+    network: adobe stock
+    shortcut: asv
+    categories: [videos]
+    adobe_order: relevance
+    adobe_content_types: ["video"]
+
+Implementation
+==============
+
 """
+from __future__ import annotations
 
+from typing import TYPE_CHECKING
+from datetime import datetime, timedelta
 from urllib.parse import urlencode
-from searx.utils import gen_useragent
+
+import isodate
+
+if TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
 
 about = {
-    "website": 'https://stock.adobe.com/',
-    "wikidata_id": 'Q5977430',
+    "website": "https://stock.adobe.com/",
+    "wikidata_id": "Q5977430",
     "official_api_documentation": None,
     "use_official_api": False,
     "require_api_key": False,
-    "results": 'JSON',
+    "results": "JSON",
 }
 
-categories = ['images']
+categories = []
 paging = True
+send_accept_language_header = True
+results_per_page = 10
 
-base_url = 'https://stock.adobe.com'
+base_url = "https://stock.adobe.com"
+
+adobe_order: str = ""
+"""Sort order, can be one of:
+
+- ``relevance`` or
+- ``featured`` or
+- ``creation`` (most recent) or
+- ``nb_downloads`` (number of downloads)
+"""
+
+ADOBE_VALID_TYPES = ["photo", "illustration", "zip_vector", "video", "template", "3d", "audio", "image"]
+adobe_content_types: list = []
+"""A list of content types.  The following content types are offered:
+
+- Images: ``image``
+- Videos: ``video``
+- Templates: ``template``
+- 3D: ``3d``
+- Audio: ``audio``
+
+Additional subcategories:
+
+- Photos: ``photo``
+- Illustrations: ``illustration``
+- Vectors: ``zip_vector``
+"""
+
+# Do we need support for "free_collection" and "include_stock_enterprise"?
 
-results_per_page = 10
-adobe_order = "relevance"  # one of 'relevant', 'featured', 'creation' or 'nb_downloads'
+
+def init(_):
+    if not categories:
+        raise ValueError("adobe_stock engine: categories is unset")
+
+    # adobe_order
+    if not adobe_order:
+        raise ValueError("adobe_stock engine: adobe_order is unset")
+    if adobe_order not in ["relevance", "featured", "creation", "nb_downloads"]:
+        raise ValueError(f"unsupported adobe_order: {adobe_order}")
+
+    # adobe_content_types
+    if not adobe_content_types:
+        raise ValueError("adobe_stock engine: adobe_content_types is unset")
+
+    if isinstance(adobe_content_types, list):
+        for t in adobe_content_types:
+            if t not in ADOBE_VALID_TYPES:
+                raise ValueError("adobe_stock engine: adobe_content_types: '%s' is invalid" % t)
+    else:
+        raise ValueError(
+            "adobe_stock engine: adobe_content_types must be a list of strings not %s" % type(adobe_content_types)
+        )
 
 
 def request(query, params):
+
     args = {
-        'k': query,
-        'limit': results_per_page,
-        'order': adobe_order,
-        'search_page': params['pageno'],
-        'search_type': 'pagination',
-        'filters[content_type:video]': 0,
-        'filters[content_type:audio]': 0,
+        "k": query,
+        "limit": results_per_page,
+        "order": adobe_order,
+        "search_page": params["pageno"],
+        "search_type": "pagination",
     }
-    params['url'] = f"{base_url}/de/Ajax/Search?{urlencode(args)}"
+
+    for content_type in ADOBE_VALID_TYPES:
+        args[f"filters[content_type:{content_type}]"] = 1 if content_type in adobe_content_types else 0
+
+    params["url"] = f"{base_url}/de/Ajax/Search?{urlencode(args)}"
 
     # headers required to bypass bot-detection
-    params['headers'] = {
-        "User-Agent": gen_useragent(),
-        "Accept-Language": "en-US,en;q=0.5",
-    }
+    if params["searxng_locale"] == "all":
+        params["headers"]["Accept-Language"] = "en-US,en;q=0.5"
 
     return params
 
 
+def parse_image_item(item):
+    return {
+        "template": "images.html",
+        "url": item["content_url"],
+        "title": item["title"],
+        "content": item["asset_type"],
+        "img_src": item["content_thumb_extra_large_url"],
+        "thumbnail_src": item["thumbnail_url"],
+        "resolution": f"{item['content_original_width']}x{item['content_original_height']}",
+        "img_format": item["format"],
+        "author": item["author"],
+    }
+
+
+def parse_video_item(item):
+
+    # in video items, the title is more or less a "content description", we try
+    # to reduce the length of the title ..
+
+    title = item["title"]
+    content = ""
+    if "." in title.strip()[:-1]:
+        content = title
+        title = title.split(".", 1)[0]
+    elif "," in title:
+        content = title
+        title = title.split(",", 1)[0]
+    elif len(title) > 50:
+        content = title
+        title = ""
+        for w in content.split(" "):
+            title += f" {w}"
+            if len(title) > 50:
+                title = title.strip() + "\u2026"
+                break
+
+    return {
+        "template": "videos.html",
+        "url": item["content_url"],
+        "title": title,
+        "content": content,
+        # https://en.wikipedia.org/wiki/ISO_8601#Durations
+        "length": isodate.parse_duration(item["time_duration"]),
+        "publishedDate": datetime.strptime(item["creation_date"], "%Y-%m-%d"),
+        "thumbnail": item["thumbnail_url"],
+        "iframe_src": item["video_small_preview_url"],
+        "metadata": item["asset_type"],
+    }
+
+
+def parse_audio_item(item):
+    audio_data = item["audio_data"]
+    content = audio_data.get("description") or ""
+    if audio_data.get("album"):
+        content = audio_data["album"] + " - " + content
+
+    return {
+        "url": item["content_url"],
+        "title": item["title"],
+        "content": content,
+        # "thumbnail": base_url + item["thumbnail_url"],
+        "iframe_src": audio_data["preview"]["url"],
+        "publishedDate": datetime.fromisoformat(audio_data["release_date"]) if audio_data["release_date"] else None,
+        "length": timedelta(seconds=round(audio_data["duration"] / 1000)) if audio_data["duration"] else None,
+        "author": item.get("artist_name"),
+    }
+
+
 def response(resp):
     results = []
 
     json_resp = resp.json()
 
-    for item in json_resp['items'].values():
-        results.append(
-            {
-                'template': 'images.html',
-                'url': item['content_url'],
-                'title': item['title'],
-                'content': '',
-                'img_src': item['content_thumb_extra_large_url'],
-                'thumbnail_src': item['thumbnail_url'],
-                'resolution': f"{item['content_original_width']}x{item['content_original_height']}",
-                'img_format': item['format'],
-                'author': item['author'],
-            }
-        )
+    if isinstance(json_resp["items"], list):
+        return None
+    for item in json_resp["items"].values():
+        if item["asset_type"].lower() in ["image", "premium-image", "illustration", "vector"]:
+            result = parse_image_item(item)
+        elif item["asset_type"].lower() == "video":
+            result = parse_video_item(item)
+        elif item["asset_type"].lower() == "audio":
+            result = parse_audio_item(item)
+        else:
+            logger.error("no handle for %s --> %s", item["asset_type"], item)
+            continue
+        results.append(result)
 
     return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 0084db902..d27172aef 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -327,9 +327,32 @@ engines:
 
   - name: adobe stock
     engine: adobe_stock
-    # available search orders: 'relevant', 'featured', 'creation', 'nb_downloads'
-    # adobe_order: relevance
-    shortcut: as
+    shortcut: asi
+    categories: ["images"]
+    # https://docs.searxng.org/dev/engines/online/adobe_stock.html
+    adobe_order: relevance
+    adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
+    timeout: 6
+    disabled: true
+
+  - name: adobe stock video
+    engine: adobe_stock
+    shortcut: asv
+    network: adobe stock
+    categories: ["videos"]
+    adobe_order: relevance
+    adobe_content_types: ["video"]
+    timeout: 6
+    disabled: true
+
+  - name: adobe stock audio
+    engine: adobe_stock
+    shortcut: asa
+    network: adobe stock
+    categories: ["music"]
+    adobe_order: relevance
+    adobe_content_types: ["audio"]
+    timeout: 6
     disabled: true
 
   - name: alpine linux packages
author	Markus Heiser <markus.heiser@darmarit.de>	2024-11-22 19:25:23 +0100
committer	Bnyro <bnyro@tutanota.com>	2024-11-24 11:56:12 +0100
commit	0253c10b522fe2e8206db78e37066771ce90cf92 (patch)
tree	43bae14fc4b48397e5ba7c0fee27f6dadd9aa155
parent	f20a7632f1421eaea094b630a677f68e7a0808c7 (diff)
download	searxng-0253c10b522fe2e8206db78e37066771ce90cf92.tar.gz searxng-0253c10b522fe2e8206db78e37066771ce90cf92.zip