Files
searxng/searx/engines/unsplash.py
Bnyro fe1d6d9c48 [fix] unsplash: fix engine due to anubis bot blocking (#5907)
Unsplash started using [Anubis](https://anubis.techaro.lol/)
for blocking crawlers. Therefore, requests using common
user agents (e.g. Firefox, Chrome) must pass a JavaScript
challenge.

However, other user agents seem unaffected for now, hence
setting the UA to something different still works.
2026-03-25 20:36:17 +01:00

64 lines
1.9 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Unsplash"""
from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
from json import loads
from searx.utils import searxng_useragent
# Engine metadata
about = {
    "website": "https://unsplash.com",
    "wikidata_id": "Q28233552",
    "official_api_documentation": "https://unsplash.com/developers",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

# Engine configuration
categories = ["images"]
paging = True
page_size = 20  # sent as ``per_page`` with every search request

# Endpoint queried by the engine; the query string is appended to ``search_url``.
base_url = "https://unsplash.com/"
search_url = f"{base_url}napi/search/photos?"
def clean_url(url):
    """Return ``url`` with every ``ixid`` query parameter removed.

    All other components (scheme, host, path, params, fragment) and all other
    query parameters are kept, in their original order.  The query string is
    re-encoded with :func:`urllib.parse.urlencode`.

    :param url: URL string to clean.
    :return: the URL without any ``ixid`` query parameter.
    """
    parsed = urlparse(url)
    # keep_blank_values=True: without it parse_qsl silently drops parameters
    # that have an empty value (e.g. ``?foo=&w=200``), although only ``ixid``
    # is supposed to be removed here.
    kept = [(key, value) for key, value in parse_qsl(parsed.query, keep_blank_values=True) if key != 'ixid']
    return urlunparse(parsed._replace(query=urlencode(kept)))
def request(query, params):
    """Prepare ``params`` for an Unsplash photo search and return it.

    The request URL is ``search_url`` followed by the URL-encoded search
    terms, the page number taken from ``params['pageno']`` and the page size.
    The ``User-Agent`` request header is overwritten with the SearXNG one.
    """
    query_args = {
        'query': query,
        'page': params['pageno'],
        'per_page': page_size,
    }
    url = search_url + urlencode(query_args)
    params['url'] = url
    logger.debug("query_url --> %s", url)

    # Anubis (https://anubis.techaro.lol/) blocks the common browser user
    # agents (e.g. Firefox, Chrome).  The SearXNG user agent is not commonly
    # used by crawlers and hence not blocked, so it is sent instead.
    params["headers"]["User-Agent"] = searxng_useragent()
    return params
def response(resp):
    """Turn the JSON body of ``resp`` into a list of image results.

    Each entry of the ``results`` array becomes one ``images.html`` result
    whose page, thumbnail and image URLs are passed through ``clean_url``.
    An empty list is returned when the reply has no ``results`` key.
    """
    payload = loads(resp.text)
    if 'results' not in payload:
        return []

    return [
        {
            'template': 'images.html',
            'url': clean_url(photo['links']['html']),
            'thumbnail_src': clean_url(photo['urls']['thumb']),
            'img_src': clean_url(photo['urls']['regular']),
            # fall back to placeholders when the description fields are
            # missing or empty
            'title': photo.get('alt_description') or 'unknown',
            'content': photo.get('description') or '',
        }
        for photo in payload['results']
    ]