The Sec-Fetch-* headers seem to cause more problems than they solve. They will be removed for now. Related: - https://github.com/searxng/searxng/pull/5758#pullrequestreview-3834221131 Suggested-by: @Bnyro Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
199 lines
5.8 KiB
Python
199 lines
5.8 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""
|
|
DuckDuckGo Extra (images, videos, news)
|
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
"""
|
|
|
|
import typing as t
|
|
|
|
from datetime import datetime
|
|
from urllib.parse import urlencode
|
|
from urllib.parse import quote_plus
|
|
|
|
from searx.utils import get_embeded_stream_url, html_to_text, gen_useragent, extr
|
|
from searx.network import get # see https://github.com/searxng/searxng/issues/762
|
|
|
|
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
|
|
from searx.engines.duckduckgo import get_ddg_lang, get_vqd, set_vqd
|
|
|
|
if t.TYPE_CHECKING:
|
|
from searx.extended_types import SXNG_Response
|
|
from searx.search.processors import OnlineParams
|
|
|
|
# about
# Engine metadata describing the upstream service this module queries.
about: dict[str, t.Any] = {
    "website": "https://duckduckgo.com/",
    "wikidata_id": "Q12805",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON (site requires js to get images)",
}
|
|
|
|
# engine dependent config
categories: list[str] = []
ddg_category: str = ""
"""The category must be any of ``images``, ``videos`` and ``news``
"""
# request() reads ``params["pageno"]`` and ``params["safesearch"]``.
paging: bool = True
safesearch: bool = True

# Safesearch level --> value sent by request() as cookie ``p`` and URL
# argument ``p``.  ``None`` means: send neither.
safesearch_cookies: dict[int, t.Optional[str]] = {
    0: "-2",
    1: None,
    2: "1",
}
# NOTE(review): not referenced by the code visible in this module -- confirm
# whether it is still needed.
safesearch_args: dict[int, t.Optional[str]] = {
    0: "1",
    1: None,
    2: "1",
}

# ``ddg_category`` --> name of the ``https://duckduckgo.com/<name>.js``
# endpoint requested by request().
search_path_map: dict[str, str] = {
    "images": "i",
    "videos": "v",
    "news": "news",
}
|
|
# Generated once when the module is loaded and then sent with every request:
# the vqd value is generated from the query and the User-Agent header, so the
# UA must stay static for a vqd value to be reusable.
_HTTP_User_Agent: str = gen_useragent()
|
|
|
|
|
|
def init(engine_settings: dict[str, t.Any]):
    """Validate the engine settings.

    :param engine_settings: settings of the engine instance; the value of the
        key ``ddg_category`` must be ``images``, ``videos`` or ``news``.
    :raises ValueError: if ``ddg_category`` has any other value.
    """
    category = engine_settings["ddg_category"]
    if category in ("images", "videos", "news"):
        return
    raise ValueError(f"Unsupported DuckDuckGo category: {category}")
|
|
|
|
|
|
def fetch_vqd(
    query: str,
    params: "OnlineParams",
) -> str:
    """Request a vqd value for ``query`` from duckduckgo.com.

    The DDG start page is requested with the headers from ``params["headers"]``
    and the value is cut out of the response text (the text between ``vqd="``
    and the next ``"``).  A value that was found is handed to ``set_vqd`` for
    this query before it is returned.

    :param query: the search term the vqd value is requested for.
    :param params: request parameters; ``params["headers"]`` is sent with the
        HTTP request and ``params`` is passed on to ``set_vqd``.
    :return: the vqd value, or an empty string if the HTTP status is not 200
        or the response does not contain a vqd value.
    """
    logger.debug("fetch_vqd: request value from duckduckgo.com")
    resp = get(
        url=f"https://duckduckgo.com/?q={quote_plus(query)}&iar=images&t=h_",
        headers=params["headers"],
        timeout=2,
    )

    # Exactly one error message per failure reason.
    if resp.status_code != 200:
        logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
        return ""

    value = extr(resp.text, 'vqd="', '"')
    if not value:
        logger.error("vqd: can't parse value from ddg response (return empty string)")
        return ""

    logger.debug("vqd value from duckduckgo.com request: '%s'", value)
    set_vqd(query=query, value=value, params=params)
    return value
|
|
|
|
|
|
def request(query: str, params: "OnlineParams") -> None:
    """Assemble the DDG request for the configured ``ddg_category``.

    Sets ``params["url"]`` to the ``https://duckduckgo.com/<name>.js`` endpoint
    (``<name>`` is looked up in ``search_path_map``) and fills in the HTTP
    headers and cookies in ``params``.  For queries of 500 or more characters
    ``params["url"]`` is set to ``None`` and nothing else is touched.

    :param query: the search term.
    :param params: request parameters, modified in place.
    """
    # DDG does not accept queries with more than 499 chars
    if len(query) >= 500:
        params["url"] = None
        return

    # HTTP headers
    # ============

    req_headers = params["headers"]
    # The vqd value is generated from the query and the UA header. To be able to
    # reuse the vqd value, the UA header must be static.  The UA is set before
    # the vqd lookup because fetch_vqd() sends params["headers"].
    req_headers["User-Agent"] = _HTTP_User_Agent
    vqd = get_vqd(query=query, params=params)
    if not vqd:
        vqd = fetch_vqd(query=query, params=params)

    # These headers are added only after the vqd value has been looked up.
    req_headers["Accept"] = "*/*"
    req_headers["Referer"] = "https://duckduckgo.com/"
    req_headers["Host"] = "duckduckgo.com"
    # req_headers["X-Requested-With"] = "XMLHttpRequest"

    # DDG XHTMLRequest
    # ================

    locale = params["searxng_locale"]
    eng_region: str = traits.get_region(
        locale,
        traits.all_locale,
    )  # pyright: ignore[reportAssignmentType]
    eng_lang: str = get_ddg_lang(traits, locale) or "wt-wt"

    cookies = params["cookies"]
    cookies["ad"] = eng_lang  # zh_CN
    cookies["ah"] = eng_region  # "us-en,de-de"
    cookies["l"] = eng_region  # "hk-tzh"

    # The insertion order of the keys is the order of the URL arguments.
    args: dict[str, str | int] = {
        "o": "json",
        "q": query,
        "u": "bing",
        "l": eng_region,
        "bpia": "1",
        "vqd": vqd,
        "a": "h_",
        "ct": "EN" if locale == "all" else locale.split("-")[0].upper(),
    }

    page_number = params["pageno"]
    if page_number > 1:
        args["s"] = (page_number - 1) * 100

    safe_search = safesearch_cookies.get(params["safesearch"])
    if safe_search is not None:
        cookies["p"] = safe_search  # "-2", "1"
        args["p"] = safe_search

    path = search_path_map[ddg_category]
    params["url"] = f"https://duckduckgo.com/{path}.js?{urlencode(args)}"

    logger.debug("param headers: %s", params["headers"])
    logger.debug("param data: %s", params["data"])
    logger.debug("param cookies: %s", params["cookies"])
|
|
|
|
|
|
def _image_result(result):
|
|
return {
|
|
'template': 'images.html',
|
|
'url': result['url'],
|
|
'title': result['title'],
|
|
'content': '',
|
|
'thumbnail_src': result['thumbnail'],
|
|
'img_src': result['image'],
|
|
'resolution': '%s x %s' % (result['width'], result['height']),
|
|
'source': result['source'],
|
|
}
|
|
|
|
|
|
def _video_result(result):
    """Build the result item (template ``videos.html``) from one video entry
    of the DDG response.

    :param result: mapping with the keys ``content`` (used as URL of the
        result), ``title``, ``description``, ``images``, ``provider`` and
        ``duration``; ``uploader`` is optional.
    :return: dict with the fields of the video result.
    """
    video_url = result["content"]
    title = result["title"]
    description = result["description"]
    images = result["images"]
    # prefer the small preview image, fall back to the medium one
    thumbnail = images.get("small") or images.get("medium")

    return {
        "template": "videos.html",
        "url": video_url,
        "title": title,
        "content": description,
        "thumbnail": thumbnail,
        "iframe_src": get_embeded_stream_url(video_url),
        "source": result["provider"],
        "length": result["duration"],
        "metadata": result.get("uploader"),
    }
|
|
|
|
|
|
def _news_result(result):
    """Build the result item from one news entry of the DDG response.

    :param result: mapping with the keys ``url``, ``title``, ``excerpt``,
        ``source`` and ``date``.
    :return: dict with the fields of the news result.
    """
    link = result["url"]
    headline = result["title"]
    # the excerpt is passed through html_to_text before it is used as content
    excerpt = html_to_text(result["excerpt"])
    origin = result["source"]
    # ``date`` is passed to datetime.fromtimestamp() without a tz argument, the
    # result is a naive datetime in local time
    published = datetime.fromtimestamp(result["date"])

    return {
        "url": link,
        "title": headline,
        "content": excerpt,
        "source": origin,
        "publishedDate": published,
    }
|
|
|
|
|
|
def response(resp):
    """Convert the JSON response of DDG into a list of result items.

    Each entry of the ``results`` list in the JSON is converted by the
    function that belongs to the configured ``ddg_category``.

    :param resp: HTTP response, its ``json()`` method must return a mapping
        with the key ``results``.
    :return: list of result dicts.
    :raises ValueError: if there is at least one entry to convert and
        ``ddg_category`` is none of ``images``, ``videos`` or ``news``.
    """
    converters = {
        "images": _image_result,
        "videos": _video_result,
        "news": _news_result,
    }
    convert = converters.get(ddg_category)

    items = []
    for entry in resp.json()["results"]:
        # an unknown category is only an error when there is something to convert
        if convert is None:
            raise ValueError(f"Invalid duckduckgo category: {ddg_category}")
        items.append(convert(entry))

    return items
|