summaryrefslogtreecommitdiff
path: root/searx/favicons
diff options
context:
space:
mode:
Diffstat (limited to 'searx/favicons')
-rw-r--r--searx/favicons/__init__.py37
-rw-r--r--searx/favicons/__main__.py12
-rw-r--r--searx/favicons/cache.py476
-rw-r--r--searx/favicons/config.py62
-rw-r--r--searx/favicons/favicons.toml25
-rw-r--r--searx/favicons/proxy.py237
-rw-r--r--searx/favicons/resolvers.py100
7 files changed, 949 insertions, 0 deletions
diff --git a/searx/favicons/__init__.py b/searx/favicons/__init__.py
new file mode 100644
index 000000000..2a9893932
--- /dev/null
+++ b/searx/favicons/__init__.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Implementations for providing the favicons in SearXNG"""
+
+from __future__ import annotations
+
+__all__ = ["init", "favicon_url", "favicon_proxy"]
+
+import pathlib
+from searx import logger
+from searx import get_setting
+from .proxy import favicon_url, favicon_proxy
+
+# Child logger 'favicons' of the logger imported from searx.
+logger = logger.getChild('favicons')
+
+
+def is_active():
+    """Return ``True`` if the setting ``search.favicon_resolver`` has a truthy value."""
+    resolver_name = get_setting("search.favicon_resolver", False)
+    return bool(resolver_name)
+
+
+def init():
+    """Load the favicon configuration and initialize the cache and the proxy.
+
+    The configuration is read from ``/etc/searxng/favicons.toml``.  If this
+    file does not exist, ``config.DEFAULT_CFG_TOML`` is used instead; the
+    missing file is only logged as an error when :py:obj:`is_active` is true.
+    """
+
+    # pylint: disable=import-outside-toplevel
+
+    from . import config, cache, proxy
+
+    cfg_file = pathlib.Path("/etc/searxng/favicons.toml")
+    config_missing = not cfg_file.exists()
+
+    if config_missing and is_active():
+        logger.error(f"missing favicon config: {cfg_file}")
+    if config_missing:
+        # fall back to the defaults shipped with the package
+        cfg_file = config.DEFAULT_CFG_TOML
+
+    logger.debug(f"load favicon config: {cfg_file}")
+    cfg = config.FaviconConfig.from_toml_file(cfg_file, use_cache=True)
+
+    # the cache is set up first, then the proxy
+    cache.init(cfg.cache)
+    proxy.init(cfg.proxy)
+
+    del cache, config, proxy, cfg
diff --git a/searx/favicons/__main__.py b/searx/favicons/__main__.py
new file mode 100644
index 000000000..c515edfea
--- /dev/null
+++ b/searx/favicons/__main__.py
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Command line implementation"""
+
+import typer
+
+from . import cache
+from . import init
+
+# Initialize the favicons package (configuration, cache and proxy) before the
+# command line is evaluated.
+init()
+# Root command line application; the commands of the cache module are mounted
+# as sub-command ``cache``.
+app = typer.Typer()
+app.add_typer(cache.app, name="cache", help="commands related to the cache")
+# Run the command line application.
+app()
diff --git a/searx/favicons/cache.py b/searx/favicons/cache.py
new file mode 100644
index 000000000..4b8276154
--- /dev/null
+++ b/searx/favicons/cache.py
@@ -0,0 +1,476 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Implementations for caching favicons.
+
+:py:obj:`FaviconCacheConfig`:
+ Configuration of the favicon cache
+
+:py:obj:`FaviconCache`:
+ Abstract base class for the implementation of a favicon cache.
+
+:py:obj:`FaviconCacheSQLite`:
+ Favicon cache that manages the favicon BLOBs in a SQLite DB.
+
+:py:obj:`FaviconCacheNull`:
+ Fallback solution if the configured cache cannot be used for system reasons.
+
+----
+
+"""
+
+from __future__ import annotations
+from typing import Literal
+
+import abc
+import dataclasses
+import hashlib
+import logging
+import pathlib
+import sqlite3
+import tempfile
+import time
+import typer
+
+from pydantic import BaseModel
+
+from searx import sqlitedb
+from searx import logger
+from searx.utils import humanize_bytes, humanize_number
+
+# Global cache instance: only declared here, the value is assigned by init().
+CACHE: "FaviconCache"
+# Marker that is registered in place of a favicon when no favicon data is
+# available (see FaviconCache.set).
+FALLBACK_ICON = b"FALLBACK_ICON"
+
+logger = logger.getChild('favicons.cache')
+# Typer application that collects the commands defined in this module.
+app = typer.Typer()
+
+
+@app.command()
+def state():
+    """show state of the cache"""
+    # print the report of the statistics of the global cache
+    cache_stats = CACHE.state()
+    print(cache_stats.report())
+
+
+@app.command()
+def maintenance(force: bool = True, debug: bool = False):
+    """perform maintenance of the cache"""
+    root_logger = logging.getLogger()
+    if not debug:
+        # Drop the handlers of the root logger and let the logger of this
+        # module print its (debug) messages without any decoration.
+        root_logger.handlers = []
+        stream_handler = logging.StreamHandler()
+        stream_handler.setFormatter(logging.Formatter("%(message)s"))
+        logger.addHandler(stream_handler)
+        logger.setLevel(logging.DEBUG)
+    else:
+        root_logger.setLevel(logging.DEBUG)
+
+    # compare the statistics from before and after the maintenance
+    before = CACHE.state()
+    CACHE.maintenance(force=force)
+    after = CACHE.state()
+    reduced_by = before - after
+
+    print("The cache has been reduced by:")
+    print(reduced_by.report("\n- {descr}: {val}").lstrip("\n"))
+
+
+def init(cfg: "FaviconCacheConfig"):
+    """Initialization of the global ``CACHE``, the implementation is selected by
+    ``cfg.db_type``.
+
+    Raises :py:obj:`NotImplementedError` if the ``db_type`` is unknown.
+    """
+
+    global CACHE  # pylint: disable=global-statement
+
+    if cfg.db_type == "mem":
+        logger.error("Favicons are cached in memory, don't use this in production!")
+        CACHE = FaviconCacheMEM(cfg)
+        return
+
+    if cfg.db_type != "sqlite":
+        raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown")
+
+    sqlite_too_old = sqlite3.sqlite_version_info <= (3, 35)
+    if not sqlite_too_old:
+        CACHE = FaviconCacheSQLite(cfg)
+        return
+
+    # without a suitable SQLite library nothing is cached at all
+    logger.critical(
+        "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
+        sqlite3.sqlite_version,
+    )
+    CACHE = FaviconCacheNull(cfg)
+
+
+class FaviconCacheConfig(BaseModel):
+ """Configuration of the favicon cache.
+
+ Each field of this model is documented by the string that follows its
+ definition. The values given here are the defaults.
+ """
+
+ db_type: Literal["sqlite", "mem"] = "sqlite"
+ """Type of the database:
+
+ ``sqlite``:
+ :py:obj:`.cache.FaviconCacheSQLite`
+
+ ``mem``:
+ :py:obj:`.cache.FaviconCacheMEM` (not recommended)
+ """
+
+ db_url: pathlib.Path = pathlib.Path(tempfile.gettempdir()) / "faviconcache.db"
+ """URL of the SQLite DB, the path to the database file."""
+
+ HOLD_TIME: int = 60 * 60 * 24 * 30 # 30 days
+ """Hold time (default in sec.), after which a BLOB is removed from the cache."""
+
+ LIMIT_TOTAL_BYTES: int = 1024 * 1024 * 50 # 50 MB
+ """Maximum of bytes (default) stored in the cache of all blobs. Note: The
+ limit is only reached at each maintenance interval after which the oldest
+ BLOBs are deleted; the limit is exceeded during the maintenance period. If
+ the maintenance period is *too long* or maintenance is switched off
+ completely, the cache grows uncontrollably."""
+
+ BLOB_MAX_BYTES: int = 1024 * 20 # 20 KB
+ """The maximum BLOB size in bytes that a favicon may have so that it can be
+ saved in the cache. If the favicon is larger, it is not saved in the cache
+ and must be requested by the client via the proxy."""
+
+ MAINTENANCE_PERIOD: int = 60 * 60
+ """Maintenance period in seconds / when :py:obj:`MAINTENANCE_MODE` is set to
+ ``auto``."""
+
+ MAINTENANCE_MODE: Literal["auto", "off"] = "auto"
+ """Type of maintenance mode
+
+ ``auto``:
+ Maintenance is carried out automatically as part of the maintenance
+ intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.
+
+ ``off``:
+ Maintenance is switched off and must be carried out by an external process
+ if required.
+ """
+
+
+@dataclasses.dataclass
+class FaviconCacheStats:
+    """Dataclass which provides information on the status of the cache.
+
+    A value of ``None`` means that the cache implementation does not provide
+    the respective figure.
+    """
+
+    favicons: int | None = None
+    bytes: int | None = None
+    domains: int | None = None
+    resolvers: int | None = None
+
+    # (field name, description used in the report, function to format the value)
+    field_descr = (
+        ("favicons", "number of favicons in cache", humanize_number),
+        ("bytes", "total size (approx. bytes) of cache", humanize_bytes),
+        ("domains", "total number of domains in cache", humanize_number),
+        ("resolvers", "number of resolvers", str),
+    )
+
+    def __sub__(self, other) -> FaviconCacheStats:
+        """Return the field-wise difference ``self - other``.
+
+        Fields that are ``None`` in one of the operands are left at their
+        default (``None``) in the result.  A :py:obj:`TypeError` is raised if
+        ``other`` is not an instance of the same class.
+        """
+        if not isinstance(other, self.__class__):
+            raise TypeError(f"unsupported operand type(s) for -: '{self.__class__}' and '{type(other)}'")
+        kwargs = {}
+        for field, _, _ in self.field_descr:
+            self_val, other_val = getattr(self, field), getattr(other, field)
+            if None in (self_val, other_val):
+                continue
+            if isinstance(self_val, int):
+                kwargs[field] = self_val - other_val
+            else:
+                kwargs[field] = self_val
+        return self.__class__(**kwargs)
+
+    def report(self, fmt: str = "{descr}: {val}\n"):
+        """Return a text report with one ``fmt`` formatted entry per field, the
+        placeholders ``{descr}`` and ``{val}`` are replaced.  Values that are
+        ``None`` are shown as ``--``."""
+        entries = []
+        for field, descr, cast in self.field_descr:
+            val = getattr(self, field)
+            val = "--" if val is None else cast(val)
+            entries.append(fmt.format(descr=descr, val=val))
+        return "".join(entries)
+
+
+class FaviconCache(abc.ABC):
+ """Abstract base class for the implementation of a favicon cache."""
+
+ @abc.abstractmethod
+ def __init__(self, cfg: FaviconCacheConfig):
+ """An instance of the favicon cache is built up from the configuration."""
+
+ @abc.abstractmethod
+ def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
+ """Returns ``None`` or the tuple of ``(data, mime)`` that has been
+ registered in the cache. The ``None`` indicates that there was no entry
+ in the cache."""
+
+ @abc.abstractmethod
+ def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
+ """Set data and mime-type in the cache. If data is None, the
+ :py:obj:`FALLBACK_ICON` is registered in the cache."""
+
+ @abc.abstractmethod
+ def state(self) -> FaviconCacheStats:
+ """Returns a :py:obj:`FaviconCacheStats` (key/values) with information
+ on the state of the cache."""
+
+ @abc.abstractmethod
+ def maintenance(self, force=False):
+ """Performs maintenance on the cache"""
+
+
+class FaviconCacheNull(FaviconCache):
+    """A dummy favicon cache that caches nothing / a fallback solution.  The
+    NullCache is used when more efficient caches such as the
+    :py:obj:`FaviconCacheSQLite` cannot be used because, for example, the SQLite
+    library is only available in an old version and does not meet the
+    requirements."""
+
+    def __init__(self, cfg: FaviconCacheConfig):
+        """The configuration is not used by the null cache."""
+
+    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
+        """There is never an entry in the null cache."""
+        return None
+
+    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
+        """Nothing is stored, always returns ``False``."""
+        return False
+
+    def state(self):
+        """The null cache always reports zero favicons."""
+        stats = FaviconCacheStats(favicons=0)
+        return stats
+
+    def maintenance(self, force=False):
+        """There is nothing to maintain."""
+
+
+class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):
+    """Favicon cache that manages the favicon BLOBs in a SQLite DB.  The DB
+    model in the SQLite DB is implemented using the abstract class
+    :py:obj:`sqlitedb.SQLiteAppl`.
+
+    The following configurations are required / supported:
+
+    - :py:obj:`FaviconCacheConfig.db_url`
+    - :py:obj:`FaviconCacheConfig.HOLD_TIME`
+    - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
+    - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
+    - :py:obj:`FaviconCacheConfig.MAINTENANCE_PERIOD`
+    - :py:obj:`FaviconCacheConfig.MAINTENANCE_MODE`
+    """
+
+    DB_SCHEMA = 1
+
+    DDL_BLOBS = """\
+CREATE TABLE IF NOT EXISTS blobs (
+ sha256 TEXT,
+ bytes_c INTEGER,
+ mime TEXT NOT NULL,
+ data BLOB NOT NULL,
+ PRIMARY KEY (sha256))"""
+
+    """Table to store BLOB objects by their sha256 hash values."""
+
+    DDL_BLOB_MAP = """\
+CREATE TABLE IF NOT EXISTS blob_map (
+ m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec.
+ sha256 TEXT,
+ resolver TEXT,
+ authority TEXT,
+ PRIMARY KEY (resolver, authority))"""
+
+    """Table to map from (resolver, authority) to sha256 hash values."""
+
+    DDL_CREATE_TABLES = {
+        "blobs": DDL_BLOBS,
+        "blob_map": DDL_BLOB_MAP,
+    }
+
+    SQL_DROP_LEFTOVER_BLOBS = (
+        "DELETE FROM blobs WHERE sha256 IN ("
+        " SELECT b.sha256"
+        " FROM blobs b"
+        " LEFT JOIN blob_map bm"
+        " ON b.sha256 = bm.sha256"
+        " WHERE bm.sha256 IS NULL)"
+    )
+    """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""
+
+    SQL_ITER_BLOBS_SHA256_BYTES_C = (
+        "SELECT b.sha256, b.bytes_c FROM blobs b"
+        " JOIN blob_map bm "
+        " ON b.sha256 = bm.sha256"
+        " ORDER BY bm.m_time ASC"
+    )
+    """Iterate over (sha256, bytes_c) of the mapped BLOBs, oldest mapping first."""
+
+    SQL_INSERT_BLOBS = (
+        "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
+        " ON CONFLICT (sha256) DO NOTHING"
+    )  # fmt: skip
+
+    SQL_INSERT_BLOB_MAP = (
+        "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
+        " ON CONFLICT DO UPDATE "
+        " SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
+    )
+
+    def __init__(self, cfg: FaviconCacheConfig):
+        """An instance of the favicon cache is built up from the configuration."""
+
+        # db_url is a pathlib.Path, which never compares equal to a str, so the
+        # comparison has to be done on its string representation.
+        if str(cfg.db_url) == ":memory:":
+            logger.critical("don't use SQLite DB in :memory: in production!!")
+        super().__init__(cfg.db_url)
+        self.cfg = cfg
+
+    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
+        """Return ``None`` if there is no entry for ``(resolver, authority)``,
+        ``(None, None)`` if the :py:obj:`FALLBACK_ICON` is registered or no BLOB
+        exists for the mapped hash, otherwise the tuple ``(data, mime)``."""
+
+        sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
+        res = self.DB.execute(sql, (resolver, authority)).fetchone()
+        if res is None:
+            return None
+
+        data, mime = (None, None)
+        sha256 = res[0]
+        if sha256 == FALLBACK_ICON:
+            return data, mime
+
+        sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
+        res = self.DB.execute(sql, (sha256,)).fetchone()
+        if res is not None:
+            data, mime = res
+        return data, mime
+
+    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
+        """Register ``data`` and ``mime`` for ``(resolver, authority)``.  If
+        ``data`` is ``None`` the :py:obj:`FALLBACK_ICON` is registered.  Returns
+        ``False`` when nothing was stored: ``data`` given without ``mime``, or
+        ``data`` larger than ``BLOB_MAX_BYTES``."""
+
+        if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
+            # Should automatic maintenance be moved to a new thread?
+            self.maintenance()
+
+        if data is not None and mime is None:
+            logger.error(
+                "favicon resolver %s tries to cache mime-type None for authority %s",
+                resolver,
+                authority,
+            )
+            return False
+
+        bytes_c = len(data or b"")
+        if bytes_c > self.cfg.BLOB_MAX_BYTES:
+            # arguments are passed to the logger, the message is only formatted
+            # when the record is actually emitted
+            logger.info(
+                "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) ",
+                resolver,
+                authority,
+                bytes_c,
+            )
+            return False
+
+        if data is None:
+            sha256 = FALLBACK_ICON
+        else:
+            sha256 = hashlib.sha256(data).hexdigest()
+
+        with self.connect() as conn:
+            if sha256 != FALLBACK_ICON:
+                conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
+            conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))
+
+        return True
+
+    @property
+    def next_maintenance_time(self) -> int:
+        """Returns (unix epoch) time of the next maintenance."""
+
+        return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")
+
+    def maintenance(self, force=False):
+        """Drop the mappings that are older than ``HOLD_TIME``, drop the BLOBs
+        that are no longer mapped and, if the cache still exceeds
+        ``LIMIT_TOTAL_BYTES``, drop the oldest favicons.  Unless ``force`` is
+        set, nothing is done before the next maintenance time is reached."""
+
+        # Prevent parallel DB maintenance cycles from other DB connections
+        # (e.g. in multi thread or process environments).
+
+        if not force and int(time.time()) < self.next_maintenance_time:
+            logger.debug("no maintenance required yet, next maintenance interval is in the future")
+            return
+        self.properties.set("LAST_MAINTENANCE", "")  # hint: this (also) sets the m_time of the property!
+
+        # do maintenance tasks
+
+        with self.connect() as conn:
+
+            # drop items not in HOLD time
+            res = conn.execute(
+                "DELETE FROM blob_map"
+                " WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - ?",
+                (self.cfg.HOLD_TIME,),
+            )
+            logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
+            res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
+            logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)
+
+            # drop old items to be in LIMIT_TOTAL_BYTES
+            total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
+            if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:
+
+                overflow = total_bytes - self.cfg.LIMIT_TOTAL_BYTES
+                freed = 0
+                sha_list = []
+                seen = set()
+                for sha256, bytes_c in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
+                    # a BLOB that is mapped more than once shows up once per
+                    # mapping, but its bytes are freed only once
+                    if sha256 in seen:
+                        continue
+                    seen.add(sha256)
+                    sha_list.append(sha256)
+                    freed += bytes_c
+                    if freed > overflow:
+                        break
+                if sha_list:
+                    # bound parameters instead of a string-built IN (...) list
+                    params = [(sha256,) for sha256 in sha_list]
+                    conn.executemany("DELETE FROM blobs WHERE sha256 = ?", params)
+                    conn.executemany("DELETE FROM blob_map WHERE sha256 = ?", params)
+                    logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), freed)
+
+    def _query_val(self, sql, default=None):
+        """Return the first column of the first row of the ``sql`` query, or
+        ``default`` if there is no row or the value is NULL."""
+        val = self.DB.execute(sql).fetchone()
+        if val is not None:
+            val = val[0]
+        if val is None:
+            val = default
+        return val
+
+    def state(self) -> FaviconCacheStats:
+        """Returns the :py:obj:`FaviconCacheStats` queried from the DB."""
+        return FaviconCacheStats(
+            favicons=self._query_val("SELECT count(*) FROM blobs", 0),
+            bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
+            domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
+            resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
+        )
+
+
+class FaviconCacheMEM(FaviconCache):
+    """Favicon cache in process' memory.  Its just a POC that stores the
+    favicons in the memory of the process.
+
+    .. attention::
+
+       Don't use it in production, it will blow up your memory!!
+
+    """
+
+    def __init__(self, cfg):
+        """Start with an empty cache, the configuration is only stored."""
+
+        self.cfg = cfg
+        # sha256 digest --> data
+        self._data = {}
+        # "<resolver>:<authority>" --> (sha256 digest, mime)
+        self._sha_mime = {}
+
+    def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:
+        """Look up ``(data, mime)``; ``None`` if nothing is registered and
+        ``data`` is ``None`` if the :py:obj:`FALLBACK_ICON` is registered."""
+
+        entry = self._sha_mime.get(f"{resolver}:{authority}")
+        if entry is None:
+            return None
+        digest, mime = entry
+        data = self._data.get(digest)
+        return (None if data == FALLBACK_ICON else data), mime
+
+    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
+        """Register ``data`` and ``mime``; data without a mime-type is rejected
+        (returns ``False``)."""
+
+        if data is not None and mime is None:
+            logger.error(
+                "favicon resolver %s tries to cache mime-type None for authority %s",
+                resolver,
+                authority,
+            )
+            return False
+
+        if data is None:
+            # no favicon available: register the fallback marker
+            data, mime = FALLBACK_ICON, None
+
+        digest = hashlib.sha256(data).hexdigest()
+        self._data[digest] = data
+        self._sha_mime[f"{resolver}:{authority}"] = (digest, mime)
+        return True
+
+    def state(self):
+        """Only the number of stored favicons is reported."""
+        return FaviconCacheStats(favicons=len(self._data))
+
+    def maintenance(self, force=False):
+        """No maintenance is implemented for the memory cache."""
+ pass
diff --git a/searx/favicons/config.py b/searx/favicons/config.py
new file mode 100644
index 000000000..1c18b1631
--- /dev/null
+++ b/searx/favicons/config.py
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# pylint: disable=missing-module-docstring
+
+from __future__ import annotations
+
+import pathlib
+from pydantic import BaseModel
+
+from searx.compat import tomllib
+from .cache import FaviconCacheConfig
+from .proxy import FaviconProxyConfig
+
+CONFIG_SCHEMA: int = 1
+"""Version of the configuration schema."""
+
+TOML_CACHE: dict[str, "FaviconConfig"] = {}
+"""Cache config objects by TOML's filename."""
+
+# Path of the favicons.toml file located in the folder of this module.
+DEFAULT_CFG_TOML = pathlib.Path(__file__).parent / "favicons.toml"
+
+
+class FaviconConfig(BaseModel):
+    """The class aggregates configurations of the favicon tools"""
+
+    cfg_schema: int
+    """Config's schema version.  The specification of the version of the schema
+    is mandatory, currently only version :py:obj:`CONFIG_SCHEMA` is supported.
+    By specifying a version, it is possible to ensure downward compatibility in
+    the event of future changes to the configuration schema"""
+
+    cache: FaviconCacheConfig = FaviconCacheConfig()
+    """Setup of the :py:obj:`.cache.FaviconCacheConfig`."""
+
+    proxy: FaviconProxyConfig = FaviconProxyConfig()
+    """Setup of the :py:obj:`.proxy.FaviconProxyConfig`."""
+
+    @classmethod
+    def from_toml_file(cls, cfg_file: pathlib.Path, use_cache: bool) -> "FaviconConfig":
+        """Create a config object from a TOML file, the ``use_cache`` argument
+        specifies whether a cache should be used.
+
+        With ``use_cache`` the config object is looked up in (and stored into)
+        :py:obj:`TOML_CACHE` by the resolved filename.  A :py:obj:`ValueError`
+        is raised if the schema version of the file is not
+        :py:obj:`CONFIG_SCHEMA`.
+        """
+
+        # the same key has to be used for the lookup and for the registration
+        cache_key = str(cfg_file.resolve())
+        if use_cache:
+            cached = TOML_CACHE.get(cache_key)
+            if cached is not None:
+                return cached
+
+        with cfg_file.open("rb") as f:
+            toml_data = tomllib.load(f)
+
+        # the settings are either in the table [favicons] or at the top level
+        toml_data = toml_data.get("favicons", toml_data)
+
+        schema = toml_data.get("cfg_schema")
+        if schema != CONFIG_SCHEMA:
+            raise ValueError(
+                f"config schema version {CONFIG_SCHEMA} is needed, version {schema} is given in {cfg_file}"
+            )
+
+        cfg = cls(**toml_data)
+        if use_cache:
+            TOML_CACHE[cache_key] = cfg
+
+        return cfg
diff --git a/searx/favicons/favicons.toml b/searx/favicons/favicons.toml
new file mode 100644
index 000000000..0e433d3aa
--- /dev/null
+++ b/searx/favicons/favicons.toml
@@ -0,0 +1,25 @@
+[favicons]
+
+cfg_schema = 1 # config's schema version no.
+
+[favicons.proxy]
+
+# max_age = 5184000 # 60 days / default: 7 days (604800 sec)
+
+# [favicons.proxy.resolver_map]
+#
+# The available favicon resolvers are registered here.
+#
+# "duckduckgo" = "searx.favicons.resolvers.duckduckgo"
+# "allesedv" = "searx.favicons.resolvers.allesedv"
+# "google" = "searx.favicons.resolvers.google"
+# "yandex" = "searx.favicons.resolvers.yandex"
+
+[favicons.cache]
+
+# db_url = "/var/cache/searxng/faviconcache.db" # default: "/tmp/faviconcache.db"
+# HOLD_TIME = 5184000 # 60 days / default: 30 days
+# LIMIT_TOTAL_BYTES = 2147483648 # 2 GB / default: 50 MB
+# BLOB_MAX_BYTES = 40960 # 40 KB / default 20 KB
+# MAINTENANCE_MODE = "off" # default: "auto"
+# MAINTENANCE_PERIOD = 600 # 10min / default: 1h
\ No newline at end of file
diff --git a/searx/favicons/proxy.py b/searx/favicons/proxy.py
new file mode 100644
index 000000000..8cefe6c59
--- /dev/null
+++ b/searx/favicons/proxy.py
@@ -0,0 +1,237 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Implementations for a favicon proxy"""
+
+from __future__ import annotations
+
+from typing import Callable
+
+import importlib
+import base64
+import pathlib
+import urllib.parse
+
+import flask
+from httpx import HTTPError
+from pydantic import BaseModel
+
+from searx import get_setting
+
+from searx.webutils import new_hmac, is_hmac_of
+from searx.exceptions import SearxEngineResponseException
+
+from .resolvers import DEFAULT_RESOLVER_MAP
+from . import cache
+
+# Cache of the data URLs of the default favicon, filled by
+# FaviconProxyConfig.favicon_data_url().
+DEFAULT_FAVICON_URL = {}
+# Global configuration of the proxy, None until init() is called.
+CFG: FaviconProxyConfig = None # type: ignore
+
+
+def init(cfg: FaviconProxyConfig):
+ """Register ``cfg`` as the global configuration ``CFG`` of the proxy."""
+ global CFG # pylint: disable=global-statement
+ CFG = cfg
+
+
+def _initial_resolver_map():
+    """Build the default ``resolver_map``: if the resolver named in the setting
+    ``search.favicon_resolver`` is one of the ``DEFAULT_RESOLVER_MAP``, it is
+    mapped to the fully qualified name of its function, otherwise the map is
+    empty."""
+    name: str = get_setting("search.favicon_resolver", None)  # type: ignore
+    func = DEFAULT_RESOLVER_MAP.get(name) if name else None
+    if not func:
+        return {}
+    return {name: f"searx.favicons.resolvers.{func.__name__}"}
+
+
+class FaviconProxyConfig(BaseModel):
+    """Configuration of the favicon proxy."""
+
+    max_age: int = 60 * 60 * 24 * 7  # seven days
+    """HTTP header Cache-Control_ ``max-age``
+
+    .. _Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control
+    """
+
+    secret_key: str = get_setting("server.secret_key")  # type: ignore
+    """By default, the value from :ref:`server.secret_key <settings server>`
+    setting is used."""
+
+    resolver_timeout: int = get_setting("outgoing.request_timeout")  # type: ignore
+    """Timeout which the resolvers should not exceed, is usually passed to the
+    outgoing request of the resolver.  By default, the value from
+    :ref:`outgoing.request_timeout <settings outgoing>` setting is used."""
+
+    resolver_map: dict[str, str] = _initial_resolver_map()
+    """The resolver_map is a key / value dictionary where the key is the name of
+    the resolver and the value is the fully qualifying name (fqn) of resolver's
+    function (the callable).  The resolvers from the python module
+    :py:obj:`searx.favicons.resolvers` are available by default."""
+
+    def get_resolver(self, name: str) -> Callable | None:
+        """Returns the callable object (function) of the resolver with the
+        ``name``.  If no resolver is registered for the ``name``, ``None`` is
+        returned.
+
+        A :py:obj:`ValueError` is raised if the registered module does not
+        provide the function.
+        """
+        fqn = self.resolver_map.get(name)
+        if fqn is None:
+            return None
+        mod_name, _, func_name = fqn.rpartition('.')
+        mod = importlib.import_module(mod_name)
+        # the default is needed here, without it getattr() raises an
+        # AttributeError and the ValueError below can never be reached
+        func = getattr(mod, func_name, None)
+        if func is None:
+            raise ValueError(f"resolver {fqn} is not implemented")
+        return func
+
+    favicon_path: str = get_setting("ui.static_path") + "/themes/{theme}/img/empty_favicon.svg"  # type: ignore
+    favicon_mime_type: str = "image/svg+xml"
+
+    def favicon(self, **replacements):
+        """Returns pathname and mimetype of the default favicon.  The
+        ``replacements`` are filled into the placeholders of ``favicon_path``."""
+        return (
+            pathlib.Path(self.favicon_path.format(**replacements)),
+            self.favicon_mime_type,
+        )
+
+    def favicon_data_url(self, **replacements):
+        """Returns data image URL of the default favicon.  The URLs are cached
+        in ``DEFAULT_FAVICON_URL`` by their ``replacements``."""
+
+        cache_key = ", ".join(f"{x}:{replacements[x]}" for x in sorted(replacements, key=str))
+        data_url = DEFAULT_FAVICON_URL.get(cache_key)
+        if data_url is not None:
+            return data_url
+
+        # use the settings of this instance, not those of the global CFG
+        fav, mimetype = self.favicon(**replacements)
+        # hint: encoding utf-8 limits favicons to be a SVG image
+        with fav.open("r", encoding="utf-8") as f:
+            data_url = f.read()
+
+        data_url = urllib.parse.quote(data_url)
+        data_url = f"data:{mimetype};utf8,{data_url}"
+        DEFAULT_FAVICON_URL[cache_key] = data_url
+        return data_url
+
+
+def favicon_proxy():
+    """REST API of SearXNG's favicon proxy service
+
+    ::
+
+        /favicon_proxy?authority=<...>&h=<...>
+
+    ``authority``:
+      Domain name :rfc:`3986` / see :py:obj:`favicon_url`
+
+    ``h``:
+      HMAC :rfc:`2104`, build up from the :ref:`server.secret_key <settings
+      server>` setting.
+
+    Malformed or unauthorized requests are answered with HTTP 400.  If the
+    resolver has no favicon, the default favicon of the theme is sent.
+    """
+    args = flask.request.args
+    authority = args.get('authority')
+
+    # malformed request or RFC 3986 authority
+    if not authority or "/" in authority:
+        return '', 400
+
+    # malformed request / does not have authorisation
+    authorized = is_hmac_of(
+        CFG.secret_key,
+        authority.encode(),
+        args.get('h', ''),
+    )
+    if not authorized:
+        return '', 400
+
+    resolver = flask.request.preferences.get_value('favicon_resolver')  # type: ignore
+    # if resolver is empty or not valid, just return HTTP 400.
+    if not resolver or resolver not in CFG.resolver_map:
+        return "", 400
+
+    data, mime = search_favicon(resolver, authority)
+
+    if data is None or mime is None:
+        # return default favicon from static path
+        theme = flask.request.preferences.get_value("theme")  # type: ignore
+        fav, mimetype = CFG.favicon(theme=theme)
+        return flask.send_from_directory(fav.parent, fav.name, mimetype=mimetype)
+
+    resp = flask.Response(data, mimetype=mime)  # type: ignore
+    resp.headers['Cache-Control'] = f"max-age={CFG.max_age}"
+    return resp
+
+
+def search_favicon(resolver: str, authority: str) -> tuple[None | bytes, None | str]:
+    """Sends the request to the favicon resolver and returns a tuple for the
+    favicon.  The tuple consists of ``(data, mime)``, if the resolver has not
+    determined a favicon, both values are ``None``.
+
+    ``data``:
+      Binary data of the favicon.
+
+    ``mime``:
+      Mime type of the favicon.
+
+    The result of the resolver is registered in the cache, this includes the
+    case that no favicon has been determined.
+    """
+
+    func = CFG.get_resolver(resolver)
+    if func is None:
+        return None, None
+
+    # to avoid superfluous requests to the resolver, first look in the cache
+    cached = cache.CACHE(resolver, authority)
+    if cached is not None:
+        return cached
+
+    data, mime = (None, None)
+    try:
+        data, mime = func(authority, timeout=CFG.resolver_timeout)
+    except (HTTPError, SearxEngineResponseException):
+        # a failed request is handled like "no favicon"
+        pass
+    else:
+        # data without a mime type (or vice versa) is not a usable favicon
+        if data is None or mime is None:
+            data, mime = (None, None)
+
+    cache.CACHE.set(resolver, authority, mime, data)
+    return data, mime
+
+
+def favicon_url(authority: str) -> str:
+    """Function to generate the image URL used for favicons in SearXNG's result
+    lists.  The ``authority`` argument (aka netloc / :rfc:`3986`) is usually a
+    (sub-) domain name.  This function is used in the HTML (jinja) templates.
+
+    .. code:: html
+
+       <div class="favicon">
+          <img src="{{ favicon_url(result.parsed_url.netloc) }}">
+       </div>
+
+    The returned URL is a route to :py:obj:`favicon_proxy` REST API.
+
+    If the favicon is already in the cache, the returned URL is a `data URL`_
+    (something like ``data:image/png;base64,...``).  By generating a data url from
+    the :py:obj:`.cache.FaviconCache`, additional HTTP round trips via the
+    :py:obj:`favicon_proxy` are saved.  However, it must also be borne in mind
+    that data urls are not cached in the client (web browser).
+
+    .. _data URL: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
+
+    """
+
+    preferences = flask.request.preferences  # type: ignore
+    resolver = preferences.get_value('favicon_resolver')
+    # if resolver is empty or not valid, just return nothing.
+    if not resolver or resolver not in CFG.resolver_map:
+        return ""
+
+    cached = cache.CACHE(resolver, authority)
+
+    if cached is None:
+        # nothing known about this authority yet: route to the favicon proxy
+        h = new_hmac(CFG.secret_key, authority.encode())
+        proxy_url = flask.url_for('favicon_proxy')
+        query = urllib.parse.urlencode({"authority": authority, "h": h})
+        return f"{proxy_url}?{query}"
+
+    if cached == (None, None):
+        # we have already checked, the resolver does not have a favicon
+        theme = preferences.get_value("theme")
+        return CFG.favicon_data_url(theme=theme)
+
+    data, mime = cached
+    return f"data:{mime};base64,{str(base64.b64encode(data), 'utf-8')}"  # type: ignore
diff --git a/searx/favicons/resolvers.py b/searx/favicons/resolvers.py
new file mode 100644
index 000000000..bde5ae2b8
--- /dev/null
+++ b/searx/favicons/resolvers.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Implementations of the favicon *resolvers* that are available in the favicon
+proxy by default. A *resolver* is a function that obtains the favicon from an
+external source. The *resolver* function receives two arguments (``domain,
+timeout``) and returns a tuple ``(data, mime)``.
+
+"""
+
+from __future__ import annotations
+
+__all__ = ["DEFAULT_RESOLVER_MAP", "allesedv", "duckduckgo", "google", "yandex"]
+
+from typing import Callable
+from searx import network
+from searx import logger
+
+# Only declared here, the mapping is assigned at the end of this module (after
+# the resolver functions are defined).
+DEFAULT_RESOLVER_MAP: dict[str, Callable]
+logger = logger.getChild('favicons.resolvers')
+
+
+def _req_args(**kwargs):
+    """Return the keyword arguments for a request of searx.network:
+    ``raise_for_httperror`` is ``False`` unless it is given in ``kwargs``."""
+    return {"raise_for_httperror": False, **kwargs}
+
+
+def allesedv(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
+    """Favicon Resolver from allesedv.com / https://favicon.allesedv.com/"""
+    url = f"https://f1.allesedv.com/32/{domain}"
+    logger.debug("fetch favicon from: %s", url)
+
+    # will just return a 200 regardless of the favicon existing or not
+    # sometimes will be correct size, sometimes not
+    response = network.get(url, **_req_args(timeout=timeout))
+    if not response or response.status_code != 200:
+        return None, None
+
+    mime = response.headers['Content-Type']
+    # for a GIF image no data is returned (only the mime type)
+    data = None if mime == 'image/gif' else response.content
+    return data, mime
+
+
+def duckduckgo(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
+    """Favicon Resolver from duckduckgo.com / https://blog.jim-nielsen.com/2021/displaying-favicons-for-any-domain/"""
+    url = f"https://icons.duckduckgo.com/ip2/{domain}.ico"
+    logger.debug("fetch favicon from: %s", url)
+
+    # will return a 404 if the favicon does not exist and a 200 if it does,
+    response = network.get(url, **_req_args(timeout=timeout))
+    if not response or response.status_code != 200:
+        return None, None
+
+    # api will respond with a 32x32 png image
+    return response.content, response.headers['Content-Type']
+
+
+def google(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
+    """Favicon Resolver from google.com"""
+
+    # URL https://www.google.com/s2/favicons?sz=32&domain={domain}" will be
+    # redirected (HTTP 301 Moved Permanently) to t1.gstatic.com/faviconV2:
+    base_url = "https://t1.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL"
+    url = f"{base_url}&url=https://{domain}&size=32"
+    logger.debug("fetch favicon from: %s", url)
+
+    # will return a 404 if the favicon does not exist and a 200 if it does,
+    response = network.get(url, **_req_args(timeout=timeout))
+    if not response or response.status_code != 200:
+        return None, None
+
+    # api will respond with a 32x32 png image
+    return response.content, response.headers['Content-Type']
+
+
+def yandex(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
+    """Favicon Resolver from yandex.com"""
+    url = f"https://favicon.yandex.net/favicon/{domain}"
+    logger.debug("fetch favicon from: %s", url)
+
+    # api will respond with a 16x16 png image, if it doesn't exist, it will be a
+    # 1x1 png image (70 bytes)
+    response = network.get(url, **_req_args(timeout=timeout))
+    if not response or response.status_code != 200:
+        return None, None
+    if len(response.content) <= 70:
+        # nothing bigger than the placeholder image
+        return None, None
+
+    return response.content, response.headers['Content-Type']
+
+
+# Maps the name of a resolver to the resolver function defined in this module.
+DEFAULT_RESOLVER_MAP = {
+ "allesedv": allesedv,
+ "duckduckgo": duckduckgo,
+ "google": google,
+ "yandex": yandex,
+}