# File: searxng/searx/engines/sogou.py
# (140 lines, 4.0 KiB, Python)

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Sogou search engine for searxng"""
import re
from datetime import datetime
from urllib.parse import urlencode
from lxml import html
from searx.exceptions import SearxEngineCaptchaException
from searx.utils import extract_text
# Engine metadata: descriptive information about the Sogou engine.
about = {
    # Public site and Wikidata item of the search provider.
    "website": "https://www.sogou.com/",
    "wikidata_id": "Q7554565",
    # No official API and no API key are used.
    "use_official_api": False,
    "require_api_key": False,
    # Format of the upstream results and their language.
    "results": "HTML",
    "language": "zh",
}
# Engine configuration
categories = ["general"]
paging = True
time_range_support = True

# Maps each supported time-range name to the value that request() sends as
# the ``s_from`` query parameter: "inttime_" followed by the range name.
time_range_dict = {span: f"inttime_{span}" for span in ("day", "week", "month", "year")}

# Root of the site; the search path and relative result links are appended to it.
base_url = "https://www.sogou.com"
def request(query, params):
    """Prepare ``params`` for a Sogou web search and return it.

    Reads ``params["pageno"]`` and ``params["time_range"]``, then sets
    ``params["url"]`` and ``params["allow_redirects"]``.
    """
    args = {"query": query, "page": params["pageno"]}

    # Only add the time filter when the requested range is one we can map.
    s_from = time_range_dict.get(params["time_range"])
    if s_from:
        args["s_from"] = s_from
        args["tsn"] = 1

    # Redirects are not followed so that response() can inspect a 302 itself.
    params["allow_redirects"] = False
    params["url"] = f"{base_url}/web?{urlencode(args)}"
    return params
def response(resp):
    """Parse a Sogou result page into a list of result dicts.

    Each result node is handed to ``_parse_results`` or
    ``_parse_results_with_image`` depending on which heading it contains;
    nodes with neither heading are skipped, as are results lacking a title
    or a URL.

    Raises:
        SearxEngineCaptchaException: if the response is a 302 redirect to
            Sogou's ``/antispider`` page.
    """
    # request() disables redirect following, so the anti-spider redirect is
    # visible here.  The target is matched for both schemes: requests go out
    # over https, so the redirect is not guaranteed to point at http.
    if resp.status_code == 302 and resp.next_request is not None:
        target = str(resp.next_request.url)
        if target.startswith(("http://www.sogou.com/antispider", "https://www.sogou.com/antispider")):
            raise SearxEngineCaptchaException()

    dom = html.fromstring(resp.text)
    results = []

    # pylint: disable=line-too-long
    for item in dom.xpath(
        '//div[contains(@class, "rb")] | //div[contains(@class, "vrwrap") and not(.//div[contains(@class, "special-wrap")])]'
    ):
        if item.xpath('.//h3[@class="pt"]/a'):
            parse = _parse_results
        elif item.xpath('.//h3[contains(@class, "vr-title")]/a'):
            parse = _parse_results_with_image
        else:
            continue

        # Serialise the node only once we know it will be parsed; the markup
        # is what the parsers search for the ``data-url`` attribute.
        result = parse(item, html.tostring(item, encoding="unicode"))
        if result["title"] and result["url"]:
            results.append(result)

    return results
def _extract_url(url, item_html):
    """Resolve a result link to the URL that should be reported.

    Args:
        url: the ``href`` text of the result link (may be empty or ``None``).
        item_html: serialised markup of the result node.

    Returns:
        ``url`` unchanged unless it starts with ``/link?url=``.  For such
        links, the first ``data-url`` attribute value found in ``item_html``
        is returned; if there is none, ``url`` is prefixed with ``base_url``.
    """
    # Local import under its own name: the module-level ``html`` is lxml's.
    from html import unescape  # pylint: disable=import-outside-toplevel

    if not url or not url.startswith("/link?url="):
        return url

    match = re.search(r'data-url="([^"]+)"', item_html)
    if match:
        # The value comes from serialised markup, where attribute values are
        # entity-escaped ("&" is written "&amp;"); decode it back to the URL.
        return unescape(match.group(1))

    return f"{base_url}{url}"
def _parse_date(text):
    """Return the first ``YYYY-M-D`` date found in ``text`` as a datetime.

    Returns ``None`` when ``text`` is empty, holds no such pattern, or the
    matched digits do not form a valid calendar date.
    """
    if not text:
        return None

    cleaned = text.strip().lstrip("-").strip()
    found = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", cleaned)
    if found is None:
        return None

    try:
        return datetime.strptime(found.group(1), "%Y-%m-%d")
    except (ValueError, TypeError):
        # Pattern matched but is not a real date (e.g. month 13).
        return None
def _parse_results(item, item_html):
    """Build a result dict from a node whose title link sits in ``h3.pt``.

    ``item`` is the result node and ``item_html`` its serialised markup,
    which is passed on to ``_extract_url``.  The returned dict has the keys
    ``title``, ``url``, ``content`` and ``publishedDate``.
    """
    link_xpath = './/h3[@class="pt"]/a'

    heading = extract_text(item.xpath(link_xpath))
    snippet = extract_text(item.xpath('.//div[@class="ft"]'))
    href = extract_text(item.xpath(link_xpath + "/@href"))
    cite_text = extract_text(item.xpath(".//cite"))

    return {
        "title": heading,
        "url": _extract_url(href, item_html),
        "content": snippet,
        "publishedDate": _parse_date(cite_text),
    }
def _parse_results_with_image(item, item_html):
    """Build a result dict from a node whose title link sits in ``h3.vr-title``.

    ``item`` is the result node and ``item_html`` its serialised markup,
    which is passed on to ``_extract_url``.  The returned dict has the keys
    ``title``, ``url``, ``content``, ``publishedDate`` and ``thumbnail``;
    ``thumbnail`` is ``None`` when the node has no image source.
    """
    title_xpath = './/h3[contains(@class, "vr-title")]/a'

    title = extract_text(item.xpath(title_xpath))
    # Fall back to the second container when the first yields no text.
    # ("attribute-centent" is the class name as it appears in the markup.)
    content = extract_text(item.xpath('.//div[contains(@class, "attribute-centent")]')) or extract_text(
        item.xpath('.//div[contains(@class, "fz-mid space-txt")]')
    )
    url = _extract_url(extract_text(item.xpath(title_xpath + "/@href")), item_html)
    published_date = _parse_date(extract_text(item.xpath('.//span[@class="cite-date"]')))

    # Thumbnail extraction is best-effort: a failure leaves it unset.
    try:
        thumbnail_src = extract_text(item.xpath('.//div[contains(@class, "img-layout")]//img/@src'))
    except (ValueError, TypeError):
        thumbnail_src = None

    thumbnail = None
    if thumbnail_src:
        # Upgrade only the leading scheme.  A blanket str.replace would also
        # rewrite any "http://" URL embedded later in the string (for example
        # in a query parameter).
        if thumbnail_src.startswith("http://"):
            thumbnail = "https://" + thumbnail_src[len("http://"):]
        else:
            thumbnail = thumbnail_src

    return {
        "title": title,
        "url": url,
        "content": content,
        "publishedDate": published_date,
        "thumbnail": thumbnail,
    }