diff --git a/requirements.txt b/requirements.txt index c450ed85d..8e841f941 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,6 @@ httpx-socks[asyncio]==0.10.0 sniffio==1.3.1 valkey==6.1.1 markdown-it-py==3.0.0 -fasttext-predict==0.9.2.4 tomli==2.4.0; python_version < '3.11' msgspec==0.20.0 typer==0.24.1 diff --git a/searx/utils.py b/searx/utils.py index 8d68fcaae..6226bef08 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -25,16 +25,11 @@ from lxml.etree import XPath, XPathError, XPathSyntaxError from lxml.etree import ElementBase, _Element # pyright: ignore[reportPrivateUsage] from searx import settings -from searx.data import USER_AGENTS, data_dir, gsa_useragents_loader +from searx.data import USER_AGENTS, gsa_useragents_loader from searx.version import VERSION_TAG -from searx.sxng_locales import sxng_locales from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException from searx import logger -if t.TYPE_CHECKING: - import fasttext.FastText # type: ignore - - logger = logger.getChild('utils') XPathSpecType: t.TypeAlias = str | XPath @@ -61,12 +56,6 @@ _JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu' _XPATH_CACHE: dict[str, XPath] = {} _LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {} -_FASTTEXT_MODEL: "fasttext.FastText._FastText | None" = None # pyright: ignore[reportPrivateUsage] -"""fasttext model to predict language of a search term""" - -SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales]) -"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`).""" - class _NotSetClass: # pylint: disable=too-few-public-methods """Internal class for this module, do not create instance of this class. @@ -610,17 +599,6 @@ def eval_xpath_getindex( return default -def _get_fasttext_model() -> "fasttext.FastText._FastText": # pyright: ignore[reportPrivateUsage] - global _FASTTEXT_MODEL # pylint: disable=global-statement - if _FASTTEXT_MODEL is None: - import fasttext # pylint: disable=import-outside-toplevel - - # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model. - fasttext.FastText.eprint = lambda x: None # type: ignore - _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz')) # type: ignore - return _FASTTEXT_MODEL - - def get_embeded_stream_url(url: str): """ Converts a standard video URL into its embed format. Supported services include Youtube, @@ -683,77 +661,6 @@ def get_embeded_stream_url(url: str): return iframe_src -def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> str | None: - """Detect the language of the ``text`` parameter. - - :param str text: The string whose language is to be detected. - - :param float threshold: Threshold filters the returned labels by a threshold - on probability. A choice of 0.3 will return labels with at least 0.3 - probability. - - :param bool only_search_languages: If ``True``, returns only supported - SearXNG search languages. see :py:obj:`searx.languages` - - :rtype: str, None - :returns: - The detected language code or ``None``. See below. - - :raises ValueError: If ``text`` is not a string. - - The language detection is done by using `a fork`_ of the fastText_ library - (`python fasttext`_). fastText_ distributes the `language identification - model`_, for reference: - - - `FastText.zip: Compressing text classification models`_ - - `Bag of Tricks for Efficient Text Classification`_ - - The `language identification model`_ support the language codes - (ISO-639-3):: - - af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs - bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es - et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia - id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li - lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah - nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru - rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl - tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh - - By using ``only_search_languages=True`` the `language identification model`_ - is harmonized with the SearXNG's language (locale) model. General - conditions of SearXNG's locale model are: - - a. SearXNG's locale of a query is passed to the - :py:obj:`searx.locales.get_engine_locale` to get a language and/or region - code that is used by an engine. - - b. Most of SearXNG's engines do not support all the languages from `language - identification model`_ and there is also a discrepancy in the ISO-639-3 - (fasttext) and ISO-639-2 (SearXNG)handling. Further more, in SearXNG the - locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant`` - (``zh_Hans``) while the `language identification model`_ reduce both to - ``zh``. - - .. _a fork: https://github.com/searxng/fasttext-predict - .. _fastText: https://fasttext.cc/ - .. _python fasttext: https://pypi.org/project/fasttext/ - .. _language identification model: https://fasttext.cc/docs/en/language-identification.html - .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 - .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 - - """ - if not isinstance(text, str): - raise ValueError('text must a str') # pyright: ignore[reportUnreachable] - r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold) # type: ignore - if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0: # type: ignore - language = r[0][0].split('__label__')[1] # type: ignore - if only_search_languages and language not in SEARCH_LANGUAGE_CODES: - return None - return language # type: ignore - return None - - def _j2p_process_escape(match: re.Match[str]) -> str: # deal with ECMA escape characters _escape = match.group(1) or match.group(2) diff --git a/searx/webadapter.py b/searx/webadapter.py index a05d1627c..f1924dd4c 100644 --- a/searx/webadapter.py +++ b/searx/webadapter.py @@ -9,7 +9,6 @@ from searx.query import RawTextQuery from searx.engines import categories, engines from searx.search.models import SearchQuery, EngineRef from searx.preferences import Preferences, is_locked -from searx.utils import detect_language # remove duplicate queries. @@ -233,9 +232,7 @@ def get_search_query_from_webapp( 4. string with the *selected locale* of the query About language/locale: if the client selects the alias ``auto`` the - ``SearchQuery`` object is build up by the :py:obj:`detected language - `. If language recognition does not have a - match the language preferred by the :py:obj:`Preferences.client` is used. + language preferred by the :py:obj:`Preferences.client` is used. If client does not have a preference, the default ``all`` is used. The *selected locale* in the tuple always represents the selected @@ -267,8 +264,7 @@ def get_search_query_from_webapp( selected_locale = query_lang if query_lang == 'auto': - query_lang = detect_language(query, threshold=0.8, only_search_languages=True) - query_lang = query_lang or preferences.client.locale_tag or 'all' + query_lang = preferences.client.locale_tag or 'all' if not is_locked('categories') and raw_text_query.specific: # if engines are calculated from query, diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py index 769af128a..d200a70bb 100755 --- a/searxng_extra/update/update_engine_descriptions.py +++ b/searxng_extra/update/update_engine_descriptions.py @@ -19,7 +19,7 @@ from searx.engines import wikidata, set_loggers from searx.utils import extract_text, searxng_useragent from searx.locales import LOCALE_NAMES, locales_initialize, match_locale from searx import searx_dir -from searx.utils import gen_useragent, detect_language +from searx.utils import gen_useragent import searx.search import searx.network from searx.data import data_dir @@ -169,7 +169,7 @@ def get_website_description(url, lang1, lang2=None): lang = extract_text(html.xpath('/html/@lang')) if lang is None and len(lang1) > 0: lang = lang1 - lang = detect_language(description) or lang or 'en' + lang = lang or 'en' lang = lang.split('_')[0] lang = lang.split('-')[0] return (lang, description) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 01056df74..271ec629b 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -194,27 +194,3 @@ class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring with self.assertRaises(SearxEngineXPathException) as context: utils.eval_xpath_getindex(doc, 'count(//i)', 1) self.assertEqual(context.exception.message, 'the result is not a list') - - def test_detect_language(self): - # make sure new line are not an issue - # fasttext.predict('') does not accept new line. - l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog') - self.assertEqual(l, 'en') - - l = utils.detect_language( - 'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす' - ) - self.assertEqual(l, 'ja') - - l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.') - self.assertEqual(l, 'tr') - - l = utils.detect_language('') - self.assertIsNone(l) - - # mix languages --> None - l = utils.detect_language('The いろはにほへと Pijamalı') - self.assertIsNone(l) - - with self.assertRaises(ValueError): - utils.detect_language(None) # type: ignore