[feat] engines: add support for aol.com (#5882)
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
8
docs/dev/engines/online/aol.rst
Normal file
8
docs/dev/engines/online/aol.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
.. _aol engine:
|
||||||
|
|
||||||
|
===
|
||||||
|
AOL
|
||||||
|
===
|
||||||
|
|
||||||
|
.. automodule:: searx.engines.aol
|
||||||
|
:members:
|
||||||
208
searx/engines/aol.py
Normal file
208
searx/engines/aol.py
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
"""AOL supports WEB, image, and video search. Internally, it uses the Bing
|
||||||
|
index.
|
||||||
|
|
||||||
|
AOL doesn't seem to support setting the language via request parameters, instead
|
||||||
|
the results are based on the URL. For example, there is
|
||||||
|
|
||||||
|
- `search.aol.com <https://search.aol.com>`_ for English results
|
||||||
|
- `suche.aol.de <https://suche.aol.de>`_ for German results
|
||||||
|
|
||||||
|
However, AOL offers its services only in a few regions:
|
||||||
|
|
||||||
|
- en-US: search.aol.com
|
||||||
|
- de-DE: suche.aol.de
|
||||||
|
- fr-FR: recherche.aol.fr
|
||||||
|
- en-GB: search.aol.co.uk
|
||||||
|
- en-CA: search.aol.ca
|
||||||
|
|
||||||
|
In order to still offer sufficient support for language and region, the `search
|
||||||
|
keywords`_ known from Bing, ``language`` and ``loc`` (region), are added to the
|
||||||
|
search term (AOL is basically just a proxy for Bing).
|
||||||
|
|
||||||
|
.. _search keywords:
|
||||||
|
https://support.microsoft.com/en-us/topic/advanced-search-keywords-ea595928-5d63-4a0b-9c6b-0b769865e78a
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from urllib.parse import urlencode, unquote_plus
|
||||||
|
import typing as t
|
||||||
|
|
||||||
|
from lxml import html
|
||||||
|
from dateutil import parser
|
||||||
|
|
||||||
|
from searx.result_types import EngineResults
|
||||||
|
from searx.utils import eval_xpath_list, eval_xpath, extract_text
|
||||||
|
|
||||||
|
if t.TYPE_CHECKING:
|
||||||
|
from searx.extended_types import SXNG_Response
|
||||||
|
from searx.search.processors import OnlineParams
|
||||||
|
|
||||||
|
about = {
|
||||||
|
"website": "https://www.aol.com",
|
||||||
|
"wikidata_id": "Q2407",
|
||||||
|
"official_api_documentation": None,
|
||||||
|
"use_official_api": False,
|
||||||
|
"require_api_key": False,
|
||||||
|
"results": "HTML",
|
||||||
|
}
|
||||||
|
|
||||||
|
categories = ["general"]
|
||||||
|
search_type = "search" # supported: search, image, video
|
||||||
|
|
||||||
|
paging = True
|
||||||
|
safesearch = True
|
||||||
|
time_range_support = True
|
||||||
|
results_per_page = 10
|
||||||
|
|
||||||
|
|
||||||
|
base_url = "https://search.aol.com"
|
||||||
|
time_range_map = {"day": "1d", "week": "1w", "month": "1m", "year": "1y"}
|
||||||
|
safesearch_map = {0: "p", 1: "r", 2: "i"}
|
||||||
|
|
||||||
|
|
||||||
|
def init(_):
|
||||||
|
if search_type not in ("search", "image", "video"):
|
||||||
|
raise ValueError(f"unsupported search type {search_type}")
|
||||||
|
|
||||||
|
|
||||||
|
def request(query: str, params: "OnlineParams") -> None:
|
||||||
|
|
||||||
|
language, region = (params["searxng_locale"].split("-") + [None])[:2]
|
||||||
|
if language and language != "all":
|
||||||
|
query = f"{query} language:{language}"
|
||||||
|
if region:
|
||||||
|
query = f"{query} loc:{region}"
|
||||||
|
|
||||||
|
args: dict[str, str | int | None] = {
|
||||||
|
"q": query,
|
||||||
|
"b": params["pageno"] * results_per_page + 1, # page is 1-indexed
|
||||||
|
"pz": results_per_page,
|
||||||
|
}
|
||||||
|
|
||||||
|
if params["time_range"]:
|
||||||
|
args["fr2"] = "time"
|
||||||
|
args["age"] = params["time_range"]
|
||||||
|
else:
|
||||||
|
args["fr2"] = "sb-top-search"
|
||||||
|
|
||||||
|
params["cookies"]["sB"] = f"vm={safesearch_map[params['safesearch']]}"
|
||||||
|
params["url"] = f"{base_url}/aol/{search_type}?{urlencode(args)}"
|
||||||
|
logger.debug(params)
|
||||||
|
|
||||||
|
|
||||||
|
def _deobfuscate_url(obfuscated_url: str) -> str | None:
|
||||||
|
# URL looks like "https://search.aol.com/click/_ylt=AwjFSDjd;_ylu=JfsdjDFd/RV=2/RE=1774058166/RO=10/RU=https%3a%2f%2fen.wikipedia.org%2fwiki%2fTree/RK=0/RS=BP2CqeMLjscg4n8cTmuddlEQA2I-" # pylint: disable=line-too-long
|
||||||
|
if not obfuscated_url:
|
||||||
|
return None
|
||||||
|
|
||||||
|
for part in obfuscated_url.split("/"):
|
||||||
|
if part.startswith("RU="):
|
||||||
|
return unquote_plus(part[3:])
|
||||||
|
# pattern for de-obfuscating URL not found, fall back to Yahoo's tracking link
|
||||||
|
return obfuscated_url
|
||||||
|
|
||||||
|
|
||||||
|
def _general_results(doc: html.HtmlElement) -> EngineResults:
|
||||||
|
res = EngineResults()
|
||||||
|
|
||||||
|
for result in eval_xpath_list(doc, "//div[@id='web']//ol/li[not(contains(@class, 'first'))]"):
|
||||||
|
obfuscated_url = extract_text(eval_xpath(result, ".//h3/a/@href"))
|
||||||
|
if not obfuscated_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = _deobfuscate_url(obfuscated_url)
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
res.add(
|
||||||
|
res.types.MainResult(
|
||||||
|
url=url,
|
||||||
|
title=extract_text(eval_xpath(result, ".//h3/a")) or "",
|
||||||
|
content=extract_text(eval_xpath(result, ".//div[contains(@class, 'compText')]")) or "",
|
||||||
|
thumbnail=extract_text(eval_xpath(result, ".//a[contains(@class, 'thm')]/img/@data-src")) or "",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def _video_results(doc: html.HtmlElement) -> EngineResults:
|
||||||
|
res = EngineResults()
|
||||||
|
|
||||||
|
for result in eval_xpath_list(doc, "//div[contains(@class, 'results')]//ol/li"):
|
||||||
|
obfuscated_url = extract_text(eval_xpath(result, ".//a/@href"))
|
||||||
|
if not obfuscated_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = _deobfuscate_url(obfuscated_url)
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
published_date_raw = extract_text(eval_xpath(result, ".//div[contains(@class, 'v-age')]"))
|
||||||
|
try:
|
||||||
|
published_date = parser.parse(published_date_raw or "")
|
||||||
|
except parser.ParserError:
|
||||||
|
published_date = None
|
||||||
|
|
||||||
|
res.add(
|
||||||
|
res.types.LegacyResult(
|
||||||
|
{
|
||||||
|
"template": "videos.html",
|
||||||
|
"url": url,
|
||||||
|
"title": extract_text(eval_xpath(result, ".//h3")),
|
||||||
|
"content": extract_text(eval_xpath(result, ".//div[contains(@class, 'compText')]")),
|
||||||
|
"thumbnail": extract_text(eval_xpath(result, ".//img[contains(@class, 'thm')]/@src")),
|
||||||
|
"length": extract_text(eval_xpath(result, ".//span[contains(@class, 'v-time')]")),
|
||||||
|
"publishedDate": published_date,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def _image_results(doc: html.HtmlElement) -> EngineResults:
|
||||||
|
res = EngineResults()
|
||||||
|
|
||||||
|
for result in eval_xpath_list(doc, "//section[@id='results']//ul/li"):
|
||||||
|
obfuscated_url = extract_text(eval_xpath(result, "./a/@href"))
|
||||||
|
if not obfuscated_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = _deobfuscate_url(obfuscated_url)
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
res.add(
|
||||||
|
res.types.LegacyResult(
|
||||||
|
{
|
||||||
|
"template": "images.html",
|
||||||
|
# results don't have an extra URL, only the image source
|
||||||
|
"url": url,
|
||||||
|
"title": extract_text(eval_xpath(result, ".//a/@aria-label")),
|
||||||
|
"thumbnail_src": extract_text(eval_xpath(result, ".//img/@src")),
|
||||||
|
"img_src": url,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def response(resp: "SXNG_Response") -> EngineResults:
|
||||||
|
doc = html.fromstring(resp.text)
|
||||||
|
|
||||||
|
match search_type:
|
||||||
|
case "search":
|
||||||
|
results = _general_results(doc)
|
||||||
|
case "image":
|
||||||
|
results = _image_results(doc)
|
||||||
|
case "video":
|
||||||
|
results = _video_results(doc)
|
||||||
|
case _:
|
||||||
|
raise ValueError("unsupported search type")
|
||||||
|
|
||||||
|
for suggestion in eval_xpath_list(doc, ".//ol[contains(@class, 'searchRightBottom')]//table//a"):
|
||||||
|
results.add(results.types.LegacyResult({"suggestion": extract_text(suggestion)}))
|
||||||
|
|
||||||
|
return results
|
||||||
@@ -421,6 +421,24 @@ engines:
|
|||||||
shortcut: conda
|
shortcut: conda
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
|
- name: aol
|
||||||
|
engine: aol
|
||||||
|
search_type: search
|
||||||
|
categories: [general]
|
||||||
|
shortcut: aol
|
||||||
|
|
||||||
|
- name: aol images
|
||||||
|
engine: aol
|
||||||
|
search_type: image
|
||||||
|
categories: [images]
|
||||||
|
shortcut: aoli
|
||||||
|
|
||||||
|
- name: aol videos
|
||||||
|
engine: aol
|
||||||
|
search_type: video
|
||||||
|
categories: [videos]
|
||||||
|
shortcut: aolv
|
||||||
|
|
||||||
- name: arch linux wiki
|
- name: arch linux wiki
|
||||||
engine: archlinux
|
engine: archlinux
|
||||||
shortcut: al
|
shortcut: al
|
||||||
|
|||||||
Reference in New Issue
Block a user