Files
searxng/searx/engines/duckduckgo_definitions.py

268 lines
11 KiB
Python
Raw Normal View History

# SPDX-License-Identifier: AGPL-3.0-or-later
[mod] DuckDuckGo: reversed engineered & upgrade to data_type: traits_v1 Partial reverse engineering of the DuckDuckGo (DDG) engines including a improved language and region handling based on the enigne.traits_v1 data. - DDG Lite - DDG Instant Answer API - DDG Images - DDG Weather docs/src/searx.engine.duckduckgo.rst: Online documentation of the DDG engines (make docs.live) searx/data/engine_traits.json Add data type "traits_v1" generated by the fetch_traits() functions from: - "duckduckgo" (WEB), - "duckduckgo images" and - "duckduckgo weather" and remove data from obsolete data type "supported_languages". searx/autocomplete.py: Reversed engineered Autocomplete from DDG. Supports DDG's languages. searx/engines/duckduckgo.py: - fetch_traits(): Fetch languages & regions from DDG. - get_ddg_lang(): Get DDG's language identifier from SearXNG's locale. DDG defines its languages by region codes. DDG-Lite does not offer a language selection to the user, only a region can be selected by the user. - Cache ``vqd`` value: The vqd value depends on the query string and is needed for the follow up pages or the images loaded by a XMLHttpRequest (DDG images). The ``vqd`` value of a search term is stored for 10min in the redis DB. - DDG Lite engine: reversed engineered request method with improved Language and region support and better ``vqd`` handling. searx/engines/duckduckgo_definitions.py: DDG Instant Answer API The *instant answers* API does not support languages, or at least we could not find out how language support should work. It seems that most of the features are based on English terms. searx/engines/duckduckgo_images.py: DDG Images Reversed engineered request method. Improved language and region handling based on cookies and the enigne.traits_v1 data. Response: add image format to the result list searx/engines/duckduckgo_weather.py: DDG Weather Improved language and region handling based on cookies and the enigne.traits_v1 data. 
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2022-11-05 15:10:52 +01:00
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented, but from
reverse engineering we can see that some services (e.g. instant answers) are
still in use by the DDG search engine.
As far as we can tell, the *instant answers* API does not support languages, or
at least we could not find out how language support should work. It seems that
most of the features are based on English terms.
"""
import typing as t
from urllib.parse import urlencode, urlparse, urljoin
2014-09-28 16:51:41 +02:00
from lxml import html
from searx.data import WIKIDATA_UNITS
[mod] DuckDuckGo: reversed engineered & upgrade to data_type: traits_v1 Partial reverse engineering of the DuckDuckGo (DDG) engines including a improved language and region handling based on the enigne.traits_v1 data. - DDG Lite - DDG Instant Answer API - DDG Images - DDG Weather docs/src/searx.engine.duckduckgo.rst: Online documentation of the DDG engines (make docs.live) searx/data/engine_traits.json Add data type "traits_v1" generated by the fetch_traits() functions from: - "duckduckgo" (WEB), - "duckduckgo images" and - "duckduckgo weather" and remove data from obsolete data type "supported_languages". searx/autocomplete.py: Reversed engineered Autocomplete from DDG. Supports DDG's languages. searx/engines/duckduckgo.py: - fetch_traits(): Fetch languages & regions from DDG. - get_ddg_lang(): Get DDG's language identifier from SearXNG's locale. DDG defines its languages by region codes. DDG-Lite does not offer a language selection to the user, only a region can be selected by the user. - Cache ``vqd`` value: The vqd value depends on the query string and is needed for the follow up pages or the images loaded by a XMLHttpRequest (DDG images). The ``vqd`` value of a search term is stored for 10min in the redis DB. - DDG Lite engine: reversed engineered request method with improved Language and region support and better ``vqd`` handling. searx/engines/duckduckgo_definitions.py: DDG Instant Answer API The *instant answers* API does not support languages, or at least we could not find out how language support should work. It seems that most of the features are based on English terms. searx/engines/duckduckgo_images.py: DDG Images Reversed engineered request method. Improved language and region handling based on cookies and the enigne.traits_v1 data. Response: add image format to the result list searx/engines/duckduckgo_weather.py: DDG Weather Improved language and region handling based on cookies and the enigne.traits_v1 data. 
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2022-11-05 15:10:52 +01:00
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import (
get_external_url,
get_earth_coordinates_url,
area_to_osm_zoom,
)
from searx.result_types import EngineResults
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# Engine metadata describing the DuckDuckGo Instant Answer API.
about = {
    "website": "https://duckduckgo.com/",
    "wikidata_id": "Q12805",
    "official_api_documentation": "https://duckduckgo.com/api",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}

# URL template of the API: the ``{query}`` placeholder is filled in by
# request() with the URL-encoded ``q=<search term>`` pair.
URL = "https://api.duckduckgo.com/" + "?{query}&format=json&pretty=0&no_redirect=1&d=1"

# URL prefixes that unit_to_str() strips from a unit URL to obtain the key it
# looks up in WIKIDATA_UNITS.
WIKIDATA_PREFIX = ["http://www.wikidata.org/entity/", "https://www.wikidata.org/entity/"]

# Callable built from the mapping ``http:`` -> ``https:``; response() applies it
# to the infobox ID.
replace_http_by_https = get_string_replaces_function({"http:": "https:"})
def is_broken_text(text: str) -> bool:
    """Tell whether ``text`` looks like one of DuckDuckGo's broken link texts.

    DuckDuckGo may return something like
    ``<a href="xxxx">http://somewhere Related website</a>``: the href URL is
    broken and the "Related website" part may contain some HTML.  The best
    solution seems to be to ignore such results.

    A text is considered broken when it starts with ``http`` and contains a
    space character.
    """
    if not text.startswith("http"):
        return False
    return " " in text
def result_to_text(text: str, htmlResult: str) -> str | None:
# TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
result = ""
2014-09-28 16:51:41 +02:00
dom = html.fromstring(htmlResult)
a = dom.xpath("//a")
if len(a) >= 1:
result = extract_text(a[0])
2014-09-28 16:51:41 +02:00
else:
result = text
if result and not is_broken_text(result):
return result
return None
2014-09-28 16:51:41 +02:00
def request(query: str, params: "OnlineParams") -> None:
    """Store the Instant Answer API URL for ``query`` in ``params["url"]``."""
    encoded_query = urlencode({"q": query})
    params["url"] = URL.format(query=encoded_query)
2013-10-14 23:54:33 +02:00
def response(resp: "SXNG_Response") -> EngineResults:
    """Convert the JSON reply of the Instant Answer API into engine results.

    Depending on what ``resp.json()`` contains, the returned
    :py:obj:`EngineResults` gets:

    - an answer, when ``Answer`` is a non-empty string and ``AnswerType`` is
      neither ``calc`` nor ``ip``,
    - one link result per entry of ``Results`` that has both ``FirstURL`` and
      ``Text``,
    - one suggestion per usable entry of ``RelatedTopics`` that has a
      ``FirstURL``,
    - one link result for a non-empty ``AbstractURL``,
    - and, when ``Heading`` is not empty, either a plain link result (exactly
      one URL and nothing else was collected) or an infobox holding the
      collected content, image, attributes, URLs and related topics.

    :param resp: HTTP response of the request built by :py:func:`request`.
    :return: the collected results.
    """
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements
    results = EngineResults()

    search_res: dict[str, str] = resp.json()

    # search_res.get("Entity") possible values (not exhaustive) :
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    # accumulators for the infobox that is assembled at the end
    content: str = ""
    heading: str = search_res.get("Heading", "")
    attributes: list[dict[str, str | dict[str, str]]] = []
    urls: list[dict[str, str | bool]] = []
    infobox_id = None
    relatedTopics: list[dict[str, str | list[str]]] = []

    # add answer if there is one
    answer: str = search_res.get("Answer", "")
    if answer:
        answer_type = search_res.get("AnswerType")
        # NOTE(review): ``logger`` is not defined in the visible part of this
        # module -- presumably it is provided from outside; confirm.
        logger.debug("AnswerType='%s' Answer='%s'", answer_type, answer)
        # non-string answers and the answer types "calc" / "ip" are skipped
        if isinstance(answer, str) and answer_type not in ["calc", "ip"]:
            results.add(
                results.types.Answer(
                    answer=html_to_text(answer),
                    url=search_res.get("AbstractURL", ""),
                )
            )

    # add infobox: the content is the definition followed by the abstract
    if "Definition" in search_res:
        content = content + search_res.get("Definition", "")

    if "Abstract" in search_res:
        content = content + search_res.get("Abstract", "")

    # image: an empty string means "no image"; a URL without host is resolved
    # against https://duckduckgo.com
    image = search_res.get("Image")
    image = None if image == "" else image
    if image is not None and urlparse(image).netloc == "":
        image = urljoin("https://duckduckgo.com", image)

    # Official website, Wikipedia page
    _result_list: list[dict[str, str]] = search_res.get("Results", [])  # pyright: ignore[reportAssignmentType]
    for ddg_result in _result_list:
        firstURL = ddg_result.get("FirstURL")
        text = ddg_result.get("Text")
        if firstURL is not None and text is not None:
            # listed in the infobox and also added as a regular result
            urls.append({"title": text, "url": firstURL})
            results.add(results.types.LegacyResult({"title": heading, "url": firstURL}))

    # related topics
    _result_list = search_res.get("RelatedTopics", [])  # pyright: ignore[reportAssignmentType]
    for ddg_result in _result_list:
        if "FirstURL" in ddg_result:
            # a single topic: becomes a suggestion unless it repeats the heading
            firstURL = ddg_result.get("FirstURL")
            text = ddg_result.get("Text", "")
            if not is_broken_text(text):
                suggestion = result_to_text(text, ddg_result.get("Result", ""))
                if suggestion != heading and suggestion is not None:
                    results.add(results.types.LegacyResult({"suggestion": suggestion}))
        elif "Topics" in ddg_result:
            # a named group of topics: collected for the infobox; the list is
            # appended first and filled in place by the loop below
            suggestions: list[str] = []
            relatedTopics.append({"name": ddg_result.get("Name", ""), "suggestions": suggestions})
            _topic_results: list[dict[str, str]] = ddg_result.get("Topics", [])  # pyright: ignore[reportAssignmentType]
            for topic_result in _topic_results:
                suggestion = result_to_text(topic_result.get("Text", ""), topic_result.get("Result", ""))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get("AbstractURL", "")
    if abstractURL != "":
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({"title": search_res.get("AbstractSource", ""), "url": abstractURL, "official": True})
        results.add(results.types.LegacyResult({"url": abstractURL, "title": heading}))

    # definition
    definitionURL = search_res.get("DefinitionURL", "")
    if definitionURL != "":
        # add as result ? as answer ? problem always in english
        # when present, the definition URL overrides the abstract URL as ID
        infobox_id = definitionURL
        urls.append({"title": search_res.get("DefinitionSource", ""), "url": definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if "Infobox" in search_res:
        infobox: dict[str, t.Any] = search_res.get("Infobox", {})  # pyright: ignore[reportAssignmentType]
        if "content" in infobox:
            # default zoom, replaced when an "area" item is found
            osm_zoom = 17
            # item holding coordinates on Earth, turned into a map URL after
            # the loop so that the zoom taken from the area can be applied
            coordinates = None
            for info in infobox.get("content", {}):
                data_type: str = info.get("data_type", "")
                data_label = info.get("label")
                data_value = info.get("value")

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url: str | None = get_external_url(data_type, data_value)  # type: ignore
                if external_url is not None:
                    urls.append({"title": data_label, "url": external_url})
                elif data_type in ["instance", "wiki_maps_trigger", "google_play_artist_id"]:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == "string" and data_label == "Website":
                    # There is already an URL for the website
                    pass
                elif data_type == "area":
                    attributes.append({"label": data_label, "value": area_to_str(data_value), "entity": "P2046"})
                    osm_zoom = area_to_osm_zoom(data_value.get("amount"))
                elif data_type == "coordinates":
                    if data_value.get("globe") == "http://www.wikidata.org/entity/Q2":
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({"label": data_label, "value": data_value, "entity": "P625"})
                elif data_type == "string":
                    attributes.append({"label": data_label, "value": data_value})

            if coordinates:
                data_label = coordinates.get("label")
                data_value = coordinates.get("value")
                latitude = data_value.get("latitude")
                longitude = data_value.get("longitude")
                _url: str = get_earth_coordinates_url(latitude, longitude, osm_zoom)  # type: ignore
                urls.append({"title": "OpenStreetMap", "url": _url, "entity": "P625"})

    # without a heading neither a link result nor an infobox is added here
    if len(heading) > 0:
        # TODO get infobox.meta.value where .label="article_title"    # pylint: disable=fixme
        if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
            # only a single URL was collected: a plain result is enough
            results.add(results.types.LegacyResult({"url": urls[0]["url"], "title": heading, "content": content}))
        else:
            results.add(
                results.types.LegacyResult(
                    {
                        "infobox": heading,
                        "id": infobox_id,
                        "content": content,
                        "img_src": image,
                        "attributes": attributes,
                        "urls": urls,
                        "relatedTopics": relatedTopics,
                    }
                )
            )

    return results
def unit_to_str(unit: str) -> str:
    """Return the symbol registered in ``WIKIDATA_UNITS`` for a unit URL.

    The first entry of ``WIKIDATA_PREFIX`` that ``unit`` starts with is
    stripped and the remainder is looked up in ``WIKIDATA_UNITS``.  When no
    prefix matches, or the lookup finds nothing, ``unit`` is returned
    unchanged.
    """
    matched_prefix = next((p for p in WIKIDATA_PREFIX if unit.startswith(p)), None)
    if matched_prefix is None:
        return unit

    entry = WIKIDATA_UNITS.get(unit[len(matched_prefix) :])
    return unit if entry is None else entry["symbol"]
def area_to_str(area: dict[str, str]) -> str:
    """Format an area value such as
    ``{"unit": "https://www.wikidata.org/entity/Q712226", "amount": "+20.99"}``.

    The unit is converted by :py:func:`unit_to_str` and the amount is parsed
    as a float; the result is ``"<amount> <unit>"``.

    :param area: mapping with the keys ``unit`` and ``amount``.
    :return: the formatted area.  When the unit is empty or the amount cannot
        be parsed as a number, the raw ``amount`` and ``unit`` values are
        joined instead.
    """
    unit = unit_to_str(area.get("unit", ""))
    if unit:
        try:
            amount = float(area.get("amount", ""))
        except (TypeError, ValueError):
            # float() raises ValueError for unparsable strings and TypeError
            # for non-numeric objects such as None (JSON null): in both cases
            # fall through to the raw rendering below.
            pass
        else:
            return "{} {}".format(amount, unit)
    return "{} {}".format(area.get("amount", ""), area.get("unit", ""))