Notes: - Safesearch doesn't seem to work properly. - In theory, multiple languages are supported, but they don't work properly even in the web UI. - We could possibly cache the request hashes (the `h` query parameter), but I'm not sure whether they ever change.
96 lines
3.2 KiB
Python
96 lines
3.2 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""GMX (general)
|
|
|
|
It's unclear which index it uses; the results were most similar to Google's.

In theory it supports multiple languages, but even when changing the region on
their website, most of the results are still in English."""
|
|
|
|
import time
|
|
import typing as t
|
|
|
|
from urllib.parse import urlencode
|
|
|
|
from searx.result_types import EngineResults
|
|
from searx.extended_types import SXNG_Response
|
|
from searx.utils import extr, gen_useragent, html_to_text
|
|
from searx.network import get
|
|
|
|
if t.TYPE_CHECKING:
|
|
from searx.search.processors import OnlineParams
|
|
|
|
# Engine metadata: the site that is queried, whether an official API is used
# (it is not, and no API key is required) and the format of the parsed results.
about = {
    "website": "https://search.gmx.com",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}
|
|
|
|
# Host used for both the HTML result page and the JSON endpoint
# (alternatively: search.gmx.net)
base_url = "https://search.gmx.com"
categories = ["general"]

# Engine feature flags; request() reads the matching "pageno", "safesearch"
# and "time_range" entries from its params.
paging = True
safesearch = True
time_range_support = True

# Time range name -> value sent as the "time" query argument
time_range_map = {
    "day": "d",
    "week": "w",
    "month": "m",
    "year": "y",
}
|
|
|
|
|
|
def _get_page_hash(query: str, page: int, headers: dict[str, str]) -> str:
    """Load the HTML result page and return the ``h`` hash embedded in it.

    The hash is the text found between ``&h=`` and ``&t=`` in the page source.

    :param query: raw (not yet URL-encoded) search terms
    :param page: number of the result page the hash is requested for
    :param headers: HTTP headers sent with the request
    :return: the text that ``extr`` extracts between ``&h=`` and ``&t=``
    """
    # URL-encode the arguments: interpolating the raw query into the URL would
    # let characters such as "&", "#", "+" or "=" truncate or alter the search
    # terms that the server sees.
    url_args = urlencode({"q": query, "page": page})
    resp = get(f"{base_url}/web/result?{url_args}", headers=headers)

    # the text we search for looks like:
    # load("/desk?lang="+eV.p.param['hl']+"&q="+eV['p']['q_encode']+"&page=5&h=aa45603&t=177582576&origin=web&comp=web_serp_pag&p=gmx-com&sp=&lr="+eV.p.param['lr0']+"&mkt="+eV.p.param['mkt0']+"&family="+eV.p.param['familyFilter']+"&fcons="+eV.p.perm.fCons,"google", "eMMO", "eMH","eMP");  # pylint: disable=line-too-long
    return extr(resp.text, "&h=", "&t=")
|
|
|
|
|
|
def request(query: str, params: 'OnlineParams'):
    """Prepare the request to the ``/desk`` endpoint for ``query``.

    The page hash is fetched first via ``_get_page_hash`` (an additional HTTP
    request).  Afterwards the final URL is stored in ``params["url"]`` and the
    browser-like headers are merged into ``params["headers"]``.
    """
    # the headers have to be as close to normal browsers as possible, otherwise you get rate-limited quickly;
    # the very same headers (including the user agent) are used for loading the hash and for the result request
    browser_headers = {
        "User-Agent": gen_useragent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Referer": base_url,
    }

    # the "t" parameter is the current time in seconds with the last digit removed,
    # e.g., if the current time is 1775829848, t is 177582984
    timestamp = int(time.time() / 10)

    # the "h" parameter (page hash) depends on the query and page number
    page_no = params["pageno"]
    page_hash = _get_page_hash(query, page_no, browser_headers)

    query_args = {"lang": "en", "q": query, "page": page_no, "h": page_hash, "t": timestamp}
    if params["safesearch"]:
        query_args["family"] = True

    time_range = params.get("time_range")
    if time_range:
        query_args["time"] = time_range_map[time_range]

    params["url"] = base_url + "/desk?" + urlencode(query_args)
    params["headers"].update(browser_headers)
|
|
|
|
|
|
def response(resp: 'SXNG_Response') -> EngineResults:
    """Turn the JSON payload of ``resp`` into engine results.

    Every entry of ``results["rs"]`` is added as a suggestion (its ``t``
    field), every entry of ``results["hits"]`` as a main result built from its
    ``u`` (URL), ``t`` (title) and ``s`` (content) fields.  Title and content
    are passed through ``html_to_text``.
    """
    engine_results = EngineResults()
    payload = resp.json()["results"]

    for related in payload["rs"]:
        suggestion_item = {"suggestion": related["t"]}
        engine_results.add(engine_results.types.LegacyResult(suggestion_item))

    for hit in payload["hits"]:
        main_result = engine_results.types.MainResult(
            url=hit["u"],
            title=html_to_text(hit["t"]),
            content=html_to_text(hit["s"]),
        )
        engine_results.add(main_result)

    return engine_results
|