add get_embeded_stream_url to searx.utils

This commit is contained in:
Austin-Olacsi 2024-09-14 16:28:35 -06:00 committed by Markus Heiser
parent f07ab6deb0
commit cbf1e90979
5 changed files with 56 additions and 14 deletions

View file

@ -123,7 +123,6 @@ from typing import Any, TYPE_CHECKING
from urllib.parse import ( from urllib.parse import (
urlencode, urlencode,
urlparse, urlparse,
parse_qs,
) )
from dateutil import parser from dateutil import parser
@ -137,6 +136,7 @@ from searx.utils import (
eval_xpath_list, eval_xpath_list,
eval_xpath_getindex, eval_xpath_getindex,
js_variable_to_python, js_variable_to_python,
get_embeded_stream_url,
) )
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
@ -311,7 +311,7 @@ def _parse_search(resp):
# In my tests a video tag in the WEB search was most often not a # In my tests a video tag in the WEB search was most often not a
# video, except the ones from youtube .. # video, except the ones from youtube ..
iframe_src = _get_iframe_src(url) iframe_src = get_embeded_stream_url(url)
if iframe_src: if iframe_src:
item['iframe_src'] = iframe_src item['iframe_src'] = iframe_src
item['template'] = 'videos.html' item['template'] = 'videos.html'
@ -328,15 +328,6 @@ def _parse_search(resp):
return result_list return result_list
def _get_iframe_src(url):
    """Return the privacy-friendly YouTube embed URL for a YouTube watch URL.

    :param url: URL of a search result.
    :return: ``https://www.youtube-nocookie.com/embed/<id>`` when ``url`` is a
        YouTube ``/watch`` URL carrying a ``v`` query argument, otherwise
        ``None``.
    """
    parsed_url = urlparse(url)

    # Only YouTube URLs can be mapped to a youtube-nocookie embed; without
    # this check any site using a '/watch?v=...' URL would be embedded as if
    # it were a YouTube video.
    if parsed_url.netloc not in ('www.youtube.com', 'youtube.com'):
        return None

    if parsed_url.path != '/watch' or not parsed_url.query:
        return None

    # parse_qs() drops blank values, so an empty 'v=' yields no video id
    video_id = parse_qs(parsed_url.query).get('v', [])  # type: ignore
    if video_id:
        return 'https://www.youtube-nocookie.com/embed/' + video_id[0]  # type: ignore
    return None
def _parse_news(json_resp): def _parse_news(json_resp):
result_list = [] result_list = []
@ -392,7 +383,7 @@ def _parse_videos(json_resp):
if result['thumbnail'] is not None: if result['thumbnail'] is not None:
item['thumbnail'] = result['thumbnail']['src'] item['thumbnail'] = result['thumbnail']['src']
iframe_src = _get_iframe_src(url) iframe_src = get_embeded_stream_url(url)
if iframe_src: if iframe_src:
item['iframe_src'] = iframe_src item['iframe_src'] = iframe_src

View file

@ -7,6 +7,7 @@ DuckDuckGo Extra (images, videos, news)
from datetime import datetime from datetime import datetime
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.utils import get_embeded_stream_url
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import ( from searx.engines.duckduckgo import (
@ -108,7 +109,7 @@ def _video_result(result):
'title': result['title'], 'title': result['title'],
'content': result['description'], 'content': result['description'],
'thumbnail': result['images'].get('small') or result['images'].get('medium'), 'thumbnail': result['images'].get('small') or result['images'].get('medium'),
'iframe_src': result['embed_url'], 'iframe_src': get_embeded_stream_url(result['content']),
'source': result['provider'], 'source': result['provider'],
'length': result['duration'], 'length': result['duration'],
'metadata': result.get('uploader'), 'metadata': result.get('uploader'),

View file

@ -34,6 +34,7 @@ from searx.engines.google import (
detect_google_sorry, detect_google_sorry,
) )
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
from searx.utils import get_embeded_stream_url
if TYPE_CHECKING: if TYPE_CHECKING:
import logging import logging
@ -125,6 +126,7 @@ def response(resp):
'content': content, 'content': content,
'author': pub_info, 'author': pub_info,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'iframe_src': get_embeded_stream_url(url),
'template': 'videos.html', 'template': 'videos.html',
} }
) )

View file

@ -61,6 +61,7 @@ from searx.utils import (
eval_xpath, eval_xpath,
eval_xpath_list, eval_xpath_list,
extract_text, extract_text,
get_embeded_stream_url,
) )
traits: EngineTraits traits: EngineTraits
@ -303,6 +304,7 @@ def parse_web_api(resp):
'title': title, 'title': title,
'url': res_url, 'url': res_url,
'content': content, 'content': content,
'iframe_src': get_embeded_stream_url(res_url),
'publishedDate': pub_date, 'publishedDate': pub_date,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'template': 'videos.html', 'template': 'videos.html',

View file

@ -17,7 +17,7 @@ from os.path import splitext, join
from random import choice from random import choice
from html.parser import HTMLParser from html.parser import HTMLParser
from html import escape from html import escape
from urllib.parse import urljoin, urlparse from urllib.parse import urljoin, urlparse, parse_qs, urlencode
from markdown_it import MarkdownIt from markdown_it import MarkdownIt
from lxml import html from lxml import html
@ -615,6 +615,52 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore
return _FASTTEXT_MODEL return _FASTTEXT_MODEL
def get_embeded_stream_url(url: Optional[str]) -> Optional[str]:
    """Convert a standard video URL into the URL of its embeddable player.

    Supported services are YouTube, Facebook, Instagram, TikTok and
    Dailymotion.

    - YouTube: ``/watch?v=<id>`` is mapped to ``youtube-nocookie.com/embed/<id>``
    - Facebook: every ``facebook.com`` URL is passed (URL-encoded) as ``href``
      to Facebook's video plugin
    - Instagram: ``/p/<shortcode>`` is mapped to ``/p/<shortcode>/embed``
    - TikTok: ``/@<user>/video/<id>`` is mapped to ``/embed/<id>``
    - Dailymotion: ``/video/<id>`` is mapped to ``/embed/video/<id>``

    :param url: URL of the video page (may be empty or ``None``).
    :return: URL suitable for an ``<iframe src=...>`` or ``None`` if ``url``
        is empty, can't be parsed or is not recognized as one of the
        supported video pages.
    """
    if not url:
        return None
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # e.g. a malformed IPv6 netloc: there is nothing to embed
        return None

    # .hostname is lower-cased and stripped of port / userinfo, unlike .netloc
    host = parsed_url.hostname or ''
    path = parsed_url.path
    iframe_src = None

    # YouTube
    if host in ('www.youtube.com', 'youtube.com') and path == '/watch' and parsed_url.query:
        video_id = parse_qs(parsed_url.query).get('v', [])
        if video_id:
            iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]

    # Facebook
    elif host in ('www.facebook.com', 'facebook.com'):
        encoded_href = urlencode({'href': url})
        iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href

    # Instagram
    elif host in ('www.instagram.com', 'instagram.com') and path.startswith('/p/'):
        # Append 'embed' to the *path* and drop query / fragment.  Appending
        # to the raw URL would glue 'embed' onto the query string or fragment.
        embed_path = path.rstrip('/') + '/embed'
        iframe_src = parsed_url._replace(path=embed_path, params='', query='', fragment='').geturl()

    # TikTok
    elif host in ('www.tiktok.com', 'tiktok.com') and path.startswith('/@') and '/video/' in path:
        # the ID is the first path segment following '/video/'
        video_id = path.split('/video/', 1)[1].split('/', 1)[0]
        if video_id:
            iframe_src = 'https://www.tiktok.com/embed/' + video_id

    # Dailymotion
    elif host in ('www.dailymotion.com', 'dailymotion.com') and path.startswith('/video/'):
        # ignore empty segments, so a trailing slash is tolerated and a bare
        # '/video/' (no ID) is rejected
        segments = [segment for segment in path.split('/') if segment]
        if len(segments) == 2:
            iframe_src = 'https://www.dailymotion.com/embed/video/' + segments[1]

    return iframe_src
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]: def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
"""Detect the language of the ``text`` parameter. """Detect the language of the ``text`` parameter.