searxng/searx/engines/gmx.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""GMX (general)

It's unclear which index it uses, the results were the most similar to Google's.

In theory it supports multiple languages, but even if changing the region on their website,
most of the results are still in English."""

import time
import typing as t

from urllib.parse import urlencode

from searx.result_types import EngineResults
from searx.extended_types import SXNG_Response
from searx.utils import extr, gen_useragent, html_to_text
from searx.network import get

if t.TYPE_CHECKING:
    from searx.search.processors import OnlineParams

about = {
    "website": "https://search.gmx.com",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

base_url = "https://search.gmx.com"  # alternatively: search.gmx.net
categories = ["general"]

paging = True
safesearch = True
time_range_support = True

time_range_map = {"day": "d", "week": "w", "month": "m", "year": "y"}


def _get_page_hash(query: str, page: int, headers: dict[str, str]) -> str:
    resp = get(f"{base_url}/web/result?q={query}&page={page}", headers=headers)

    # the text we search for looks like:
    # load("/desk?lang="+eV.p.param['hl']+"&q="+eV['p']['q_encode']+"&page=5&h=aa45603&t=177582576&origin=web&comp=web_serp_pag&p=gmx-com&sp=&lr="+eV.p.param['lr0']+"&mkt="+eV.p.param['mkt0']+"&family="+eV.p.param['familyFilter']+"&fcons="+eV.p.perm.fCons,"google", "eMMO", "eMH","eMP");  # pylint: disable=line-too-long
    return extr(resp.text, "&h=", "&t=")


def request(query: str, params: 'OnlineParams'):
    # the headers have to be as close to normal browsers as possible, otherwise you get rate-limited quickly
    # the user agent for loading the hash and requesting the results has to be the same
    headers = {
        "User-Agent": gen_useragent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Referer": base_url,
    }

    # the "h" parameter has to be set to the current time in seconds with the last digit removed
    # e.g., if the current time is 1775829848, h has to be 177582984
    now = int(time.time() / 10)

    # the page hash depends on the query and page number
    page_hash = _get_page_hash(query, params["pageno"], headers)
    # the headers have to match the ones from the previous request

    args = {"lang": "en", "q": query, "page": params["pageno"], "h": page_hash, "t": now}
    if params["safesearch"]:
        args["family"] = True
    if params.get("time_range"):
        args["time"] = time_range_map[params["time_range"]]

    params["url"] = f"{base_url}/desk?{urlencode(args)}"

    params["headers"].update(headers)


def response(resp: 'SXNG_Response') -> EngineResults:
    res = EngineResults()

    results = resp.json()["results"]

    for suggestion in results["rs"]:
        res.add(res.types.LegacyResult({"suggestion": suggestion["t"]}))

    for result in results["hits"]:
        res.add(
            res.types.MainResult(
                url=result["u"],
                title=html_to_text(result["t"]),
                content=html_to_text(result["s"]),
            )
        )

    return res