2021-03-26 12:22:49 +01:00
|
|
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
2025-03-16 07:00:47 +00:00
|
|
|
|
"""CORE_ (COnnecting REpositories) provides a comprehensive bibliographic
|
|
|
|
|
|
database of the world’s scholarly literature, collecting and indexing
|
|
|
|
|
|
research from repositories and journals.
|
|
|
|
|
|
|
|
|
|
|
|
.. _CORE: https://core.ac.uk/about
|
|
|
|
|
|
|
2025-09-10 16:34:30 +02:00
|
|
|
|
.. note::
|
|
|
|
|
|
|
|
|
|
|
|
The CORE engine requires an :py:obj:`API key <api_key>`.
|
|
|
|
|
|
|
2025-03-16 07:00:47 +00:00
|
|
|
|
.. _core engine config:
|
|
|
|
|
|
|
|
|
|
|
|
Configuration
|
|
|
|
|
|
=============
|
|
|
|
|
|
|
|
|
|
|
|
The engine has the following additional settings:
|
|
|
|
|
|
|
|
|
|
|
|
- :py:obj:`api_key`
|
|
|
|
|
|
|
|
|
|
|
|
.. code:: yaml
|
|
|
|
|
|
|
|
|
|
|
|
- name: core.ac.uk
|
|
|
|
|
|
api_key: "..."
|
2025-09-10 16:34:30 +02:00
|
|
|
|
inactive: false
|
2025-03-16 07:00:47 +00:00
|
|
|
|
|
|
|
|
|
|
Implementations
|
|
|
|
|
|
===============
|
2021-03-26 12:22:49 +01:00
|
|
|
|
|
|
|
|
|
|
"""
|
2025-09-10 16:34:30 +02:00
|
|
|
|
|
|
|
|
|
|
import typing as t
|
2021-03-26 12:22:49 +01:00
|
|
|
|
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
from urllib.parse import urlencode
|
|
|
|
|
|
|
2025-09-10 16:34:30 +02:00
|
|
|
|
from searx.result_types import EngineResults
|
|
|
|
|
|
|
|
|
|
|
|
if t.TYPE_CHECKING:
|
|
|
|
|
|
from searx.extended_types import SXNG_Response
|
|
|
|
|
|
from searx.search.processors import OnlineParams
|
|
|
|
|
|
|
2021-04-04 12:48:24 +02:00
|
|
|
|
|
2021-03-26 12:22:49 +01:00
|
|
|
|
# Engine metadata shown in SearXNG's engine overview / preferences.
about = {
    "website": "https://core.ac.uk",
    "wikidata_id": "Q22661180",
    "official_api_documentation": "https://api.core.ac.uk/docs/v3",
    "use_official_api": True,
    "require_api_key": True,
    "results": "JSON",
}

api_key = ""
"""For an API key register at https://core.ac.uk/services/api and insert
the API key in the engine :ref:`core engine config`."""

categories = ["science", "scientific publications"]
paging = True
# number of results requested per page (the API v3 ``limit`` parameter)
nb_per_page = 10
# endpoint of CORE's API v3 full-text search over "works"
base_url = "https://api.core.ac.uk/v3/search/works/"
|
2021-03-26 12:22:49 +01:00
|
|
|
|
|
2021-12-27 09:26:22 +01:00
|
|
|
|
|
2025-09-10 16:34:30 +02:00
|
|
|
|
def setup(engine_settings: dict[str, t.Any]) -> bool:
|
|
|
|
|
|
"""Initialization of the CORE_ engine, checks whether the :py:obj:`api_key`
|
|
|
|
|
|
is set, otherwise the engine is inactive.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
key: str = engine_settings.get("api_key", "")
|
|
|
|
|
|
if key and key not in ("unset", "unknown", "..."):
|
|
|
|
|
|
return True
|
|
|
|
|
|
logger.error("CORE's API key is not set or invalid.")
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def request(query: str, params: "OnlineParams") -> None:
    """Assemble the CORE API v3 search URL and the authorization header."""
    offset = (params["pageno"] - 1) * nb_per_page

    # API v3 uses different parameters
    query_args = urlencode(
        {
            "q": query,
            "offset": offset,
            "limit": nb_per_page,
            "sort": "relevance",
        }
    )

    params["headers"] = {"Authorization": f"Bearer {api_key}"}
    params["url"] = base_url + "?" + query_args
|
2021-03-26 12:22:49 +01:00
|
|
|
|
|
2021-12-27 09:26:22 +01:00
|
|
|
|
|
2025-09-10 16:34:30 +02:00
|
|
|
|
def response(resp: "SXNG_Response") -> EngineResults:
    """Parse CORE's API v3 JSON response into paper results.

    Result items without a title or without any usable URL are skipped.
    The URL is chosen in order of preference: DOI, CORE work page
    (``id``), ``downloadUrl``, ``sourceFulltextUrls``.
    """
    # pylint: disable=too-many-branches
    res = EngineResults()
    json_data = resp.json()

    for result in json_data.get("results", []):
        # skip items without a title
        if not result.get("title"):
            continue

        # Get URL - try different options, DOI first.
        # NOTE: the previous implementation let a later ``id`` branch
        # overwrite the DOI-based URL; the DOI must take precedence.
        url: str | None = None
        doi = result.get("doi")
        if doi:
            url = f"https://doi.org/{doi}"
        elif result.get("id"):
            url = "https://core.ac.uk/works/" + str(result["id"])
        elif result.get("downloadUrl"):
            url = result["downloadUrl"]
        elif result.get("sourceFulltextUrls"):
            url = result["sourceFulltextUrls"]
        else:
            # some result items from core.ac.uk do not have any URL
            continue

        # Published date: fall back to the deposit date when there is no
        # publication date.  Parse ``raw_date`` (the previous code read
        # ``result["publishedDate"]`` again, raising KeyError when only
        # ``depositedDate`` was present).
        published_date = None
        raw_date = result.get("publishedDate") or result.get("depositedDate")
        if raw_date:
            try:
                # CORE returns ISO 8601 dates, sometimes with a trailing "Z"
                published_date = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
            except (ValueError, AttributeError):
                pass

        # Handle journals: collect the titles of all journals with a title
        journals = [j["title"] for j in (result.get("journals") or []) if j.get("title")]

        # Handle publisher: the value may be None, and some publisher names
        # come wrapped in single quotes
        publisher = (result.get("publisher") or "").strip("'")

        # Handle authors: de-duplicate by name
        authors: set[str] = set()
        for author in result.get("authors", []):
            name: str | None = author.get("name")
            if name:
                authors.add(name)

        res.add(
            res.types.Paper(
                title=result.get("title"),
                url=url,
                content=result.get("fullText", "") or "",
                tags=result.get("fieldOfStudy", []),
                publishedDate=published_date,
                type=result.get("documentType", "") or "",
                authors=authors,
                editor=", ".join(result.get("contributors", [])),
                publisher=publisher,
                journal=", ".join(journals),
                doi=result.get("doi"),
                pdf_url=result.get("downloadUrl", {}) or result.get("sourceFulltextUrls", {}),
            )
        )

    return res
|