summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAustin-Olacsi <138650713+Austin-Olacsi@users.noreply.github.com>2024-09-14 16:28:35 -0600
committerMarkus Heiser <markus.heiser@darmarIT.de>2024-10-03 07:10:53 +0200
commitcbf1e9097929cf851d31bfd17e87bec7d1e51422 (patch)
tree70532240b01da30e7acd54c86e53825ac9a10135
parentf07ab6deb0f43a2d08f4f12335481825c6aa77ac (diff)
downloadsearxng-cbf1e9097929cf851d31bfd17e87bec7d1e51422.tar.gz
searxng-cbf1e9097929cf851d31bfd17e87bec7d1e51422.zip
add get_embeded_stream_url to searx.utils
-rw-r--r--searx/engines/brave.py15
-rw-r--r--searx/engines/duckduckgo_extra.py3
-rw-r--r--searx/engines/google_videos.py2
-rw-r--r--searx/engines/qwant.py2
-rw-r--r--searx/utils.py48
5 files changed, 56 insertions, 14 deletions
diff --git a/searx/engines/brave.py b/searx/engines/brave.py
index 6f7e342e7..648aee562 100644
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -123,7 +123,6 @@ from typing import Any, TYPE_CHECKING
from urllib.parse import (
urlencode,
urlparse,
- parse_qs,
)
from dateutil import parser
@@ -137,6 +136,7 @@ from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
js_variable_to_python,
+ get_embeded_stream_url,
)
from searx.enginelib.traits import EngineTraits
@@ -311,7 +311,7 @@ def _parse_search(resp):
# In my tests a video tag in the WEB search was most often not a
# video, except the ones from youtube ..
- iframe_src = _get_iframe_src(url)
+ iframe_src = get_embeded_stream_url(url)
if iframe_src:
item['iframe_src'] = iframe_src
item['template'] = 'videos.html'
@@ -328,15 +328,6 @@ def _parse_search(resp):
return result_list
-def _get_iframe_src(url):
- parsed_url = urlparse(url)
- if parsed_url.path == '/watch' and parsed_url.query:
- video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
- if video_id:
- return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
- return None
-
-
def _parse_news(json_resp):
result_list = []
@@ -392,7 +383,7 @@ def _parse_videos(json_resp):
if result['thumbnail'] is not None:
item['thumbnail'] = result['thumbnail']['src']
- iframe_src = _get_iframe_src(url)
+ iframe_src = get_embeded_stream_url(url)
if iframe_src:
item['iframe_src'] = iframe_src
diff --git a/searx/engines/duckduckgo_extra.py b/searx/engines/duckduckgo_extra.py
index 83ca38c26..b30574d6c 100644
--- a/searx/engines/duckduckgo_extra.py
+++ b/searx/engines/duckduckgo_extra.py
@@ -7,6 +7,7 @@ DuckDuckGo Extra (images, videos, news)
from datetime import datetime
from typing import TYPE_CHECKING
from urllib.parse import urlencode
+from searx.utils import get_embeded_stream_url
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import (
@@ -108,7 +109,7 @@ def _video_result(result):
'title': result['title'],
'content': result['description'],
'thumbnail': result['images'].get('small') or result['images'].get('medium'),
- 'iframe_src': result['embed_url'],
+ 'iframe_src': get_embeded_stream_url(result['content']),
'source': result['provider'],
'length': result['duration'],
'metadata': result.get('uploader'),
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 4a032ef0f..c8fc934af 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -34,6 +34,7 @@ from searx.engines.google import (
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
+from searx.utils import get_embeded_stream_url
if TYPE_CHECKING:
import logging
@@ -125,6 +126,7 @@ def response(resp):
'content': content,
'author': pub_info,
'thumbnail': thumbnail,
+ 'iframe_src': get_embeded_stream_url(url),
'template': 'videos.html',
}
)
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index c30018d85..7ad6cf58a 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -61,6 +61,7 @@ from searx.utils import (
eval_xpath,
eval_xpath_list,
extract_text,
+ get_embeded_stream_url,
)
traits: EngineTraits
@@ -303,6 +304,7 @@ def parse_web_api(resp):
'title': title,
'url': res_url,
'content': content,
+ 'iframe_src': get_embeded_stream_url(res_url),
'publishedDate': pub_date,
'thumbnail': thumbnail,
'template': 'videos.html',
diff --git a/searx/utils.py b/searx/utils.py
index 407d44cd0..c0c6261f9 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -17,7 +17,7 @@ from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from html import escape
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin, urlparse, parse_qs, urlencode
from markdown_it import MarkdownIt
from lxml import html
@@ -615,6 +615,52 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore
return _FASTTEXT_MODEL
+def get_embeded_stream_url(url):
+    """Convert a standard video page URL into its embeddable (iframe) form.
+
+    Supported services are YouTube, Facebook, Instagram, TikTok and Dailymotion.
+
+    :param url: URL of a video page.
+    :return: URL usable as the ``src`` of an iframe, or ``None`` when ``url``
+        is not a recognized video page of one of the supported services.
+    """
+    parsed_url = urlparse(url)
+    # ``hostname`` is lower-cased and carries no port or credentials, so the
+    # host comparison below does not depend on how the URL was spelled.
+    host = parsed_url.hostname
+    path = parsed_url.path
+    iframe_src = None
+
+    # YouTube
+    if host in ('www.youtube.com', 'youtube.com') and path == '/watch':
+        # parse_qs() drops blank values, so a present 'v' is never empty.
+        video_id = parse_qs(parsed_url.query).get('v', [])
+        if video_id:
+            iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
+
+    # Facebook
+    elif host in ('www.facebook.com', 'facebook.com'):
+        # The video plugin takes the original page URL as its 'href' argument.
+        encoded_href = urlencode({'href': url})
+        iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
+
+    # Instagram
+    elif host in ('www.instagram.com', 'instagram.com') and path.startswith('/p/'):
+        # Rebuild the URL from its path: appending to the raw URL would place
+        # 'embed' behind any query string or fragment ('...?hl=enembed').
+        embed_path = path if path.endswith('/') else path + '/'
+        iframe_src = parsed_url._replace(path=embed_path + 'embed', params='', query='', fragment='').geturl()
+
+    # TikTok
+    elif host in ('www.tiktok.com', 'tiktok.com') and path.startswith('/@') and '/video/' in path:
+        # Keep only the segment right after '/video/' so that a trailing slash
+        # or deeper path segments do not leak into the embed URL.
+        video_id = path.partition('/video/')[2].split('/')[0]
+        if video_id:
+            iframe_src = 'https://www.tiktok.com/embed/' + video_id
+
+    # Dailymotion
+    elif host in ('www.dailymotion.com', 'dailymotion.com') and path.startswith('/video/'):
+        # Accept '/video/<id>' with or without a trailing slash, nothing deeper;
+        # a bare '/video/' has no id and must not produce an embed URL.
+        path_parts = path.rstrip('/').split('/')
+        if len(path_parts) == 3:
+            iframe_src = 'https://www.dailymotion.com/embed/video/' + path_parts[2]
+
+    return iframe_src
+
+
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
"""Detect the language of the ``text`` parameter.