diff --git a/dateparser/calendars/jalali_parser.py b/dateparser/calendars/jalali_parser.py index ef437d5fb..602493ce4 100644 --- a/dateparser/calendars/jalali_parser.py +++ b/dateparser/calendars/jalali_parser.py @@ -1,5 +1,4 @@ import re -from collections import OrderedDict from functools import reduce from convertdate import persian @@ -40,35 +39,31 @@ class jalali_parser(non_gregorian_parser): "۹": 9, } - _months = OrderedDict( - [ - # pinglish : (persian literals, month index, number of days) - ("Farvardin", (1, 31, ["فروردین"])), - ("Ordibehesht", (2, 31, ["اردیبهشت"])), - ("Khordad", (3, 31, ["خرداد"])), - ("Tir", (4, 31, ["تیر"])), - ("Mordad", (5, 31, ["امرداد", "مرداد"])), - ("Shahrivar", (6, 31, ["شهریور", "شهريور"])), - ("Mehr", (7, 30, ["مهر"])), - ("Aban", (8, 30, ["آبان"])), - ("Azar", (9, 30, ["آذر"])), - ("Dey", (10, 30, ["دی"])), - ("Bahman", (11, 30, ["بهمن", "بهن"])), - ("Esfand", (12, 29, ["اسفند"])), - ] - ) - - _weekdays = OrderedDict( - [ - ("Sunday", ["یکشنبه"]), - ("Monday", ["دوشنبه"]), - ("Tuesday", ["سهشنبه", "سه شنبه"]), - ("Wednesday", ["چهارشنبه", "چهار شنبه"]), - ("Thursday", ["پنجشنبه", "پنج شنبه"]), - ("Friday", ["جمعه"]), - ("Saturday", ["روز شنبه", "شنبه"]), - ] - ) + _months = { + # pinglish : (persian literals, month index, number of days) + "Farvardin": (1, 31, ["فروردین"]), + "Ordibehesht": (2, 31, ["اردیبهشت"]), + "Khordad": (3, 31, ["خرداد"]), + "Tir": (4, 31, ["تیر"]), + "Mordad": (5, 31, ["امرداد", "مرداد"]), + "Shahrivar": (6, 31, ["شهریور", "شهريور"]), + "Mehr": (7, 30, ["مهر"]), + "Aban": (8, 30, ["آبان"]), + "Azar": (9, 30, ["آذر"]), + "Dey": (10, 30, ["دی"]), + "Bahman": (11, 30, ["بهمن", "بهن"]), + "Esfand": (12, 29, ["اسفند"]), + } + + _weekdays = { + "Sunday": ["یکشنبه"], + "Monday": ["دوشنبه"], + "Tuesday": ["سهشنبه", "سه شنبه"], + "Wednesday": ["چهارشنبه", "چهار شنبه"], + "Thursday": ["پنجشنبه", "پنج شنبه"], + "Friday": ["جمعه"], + "Saturday": ["روز شنبه", "شنبه"], + } _number_letters = { 0: ["صفر"], diff --git 
a/dateparser/data/date_translation_data/en.py b/dateparser/data/date_translation_data/en.py index f348598f8..39241c07c 100644 --- a/dateparser/data/date_translation_data/en.py +++ b/dateparser/data/date_translation_data/en.py @@ -800,6 +800,9 @@ "after" ], "simplifications": [ + { + "(\\d+[.,]?\\d*)\\s*mons?\\b": "\\1 month" + }, { "an": "1" }, diff --git a/dateparser/languages/loader.py b/dateparser/languages/loader.py index 023e02eaa..e6d5225f2 100644 --- a/dateparser/languages/loader.py +++ b/dateparser/languages/loader.py @@ -158,7 +158,7 @@ def _load_data( use_given_order=False, allow_conflicting_locales=False, ): - locale_dict = OrderedDict() + locale_dict = {} if locales: invalid_locales = [] for locale in locales: @@ -198,7 +198,7 @@ def _load_data( ) if not use_given_order: - locale_dict = OrderedDict( + locale_dict = dict( sorted(locale_dict.items(), key=lambda x: language_order.index(x[1][0])) ) diff --git a/dateparser/languages/locale.py b/dateparser/languages/locale.py index bf33f4375..553a54609 100644 --- a/dateparser/languages/locale.py +++ b/dateparser/languages/locale.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from itertools import chain import regex as re @@ -226,7 +225,7 @@ def _get_relative_translations(self, settings=None): def _generate_relative_translations(self, normalize=False): relative_translations = self.info.get("relative-type-regex", {}) - relative_dictionary = OrderedDict() + relative_dictionary = {} for key, value in relative_translations.items(): if normalize: value = list(map(normalize_unicode, value)) diff --git a/dateparser/parser.py b/dateparser/parser.py index f123ab33a..54bde2ceb 100644 --- a/dateparser/parser.py +++ b/dateparser/parser.py @@ -1,5 +1,4 @@ import calendar -from collections import OrderedDict from datetime import datetime, timedelta, timezone from io import StringIO @@ -239,12 +238,10 @@ def _check_strict_parsing(missing, settings): class _parser: - alpha_directives = OrderedDict( - [ - ("weekday", 
["%A", "%a"]), - ("month", ["%B", "%b"]), - ] - ) + alpha_directives = { + "weekday": ["%A", "%a"], + "month": ["%B", "%b"], + } num_directives = { "month": ["%m"], @@ -273,10 +270,10 @@ def __init__(self, tokens, settings): self._token_year = None self._token_time = None - self.ordered_num_directives = OrderedDict( - (k, self.num_directives[k]) - for k in (resolve_date_order(settings.DATE_ORDER, lst=True)) - ) + self.ordered_num_directives = { + k: self.num_directives[k] + for k in resolve_date_order(settings.DATE_ORDER, lst=True) + } skip_index = [] skip_component = None diff --git a/dateparser_data/supplementary_language_data/date_translation_data/en.yaml b/dateparser_data/supplementary_language_data/date_translation_data/en.yaml index ac07a0ef2..a4e9a1ad8 100644 --- a/dateparser_data/supplementary_language_data/date_translation_data/en.yaml +++ b/dateparser_data/supplementary_language_data/date_translation_data/en.yaml @@ -82,6 +82,7 @@ relative-type-regex: - (\d+[.,]?\d*) decades? ago simplifications: + - (\d+[.,]?\d*)\s*mons?\b: \1 month - an: '1' - a: '1' - (?:12\s+)?noon: '12:00' diff --git a/dateparser_scripts/get_cldr_data.py b/dateparser_scripts/get_cldr_data.py index 6e227c819..2e5b77185 100644 --- a/dateparser_scripts/get_cldr_data.py +++ b/dateparser_scripts/get_cldr_data.py @@ -1,7 +1,6 @@ import json import os import shutil -from collections import OrderedDict import regex as re @@ -49,17 +48,17 @@ def _retrieve_locale_data(locale): ca_gregorian_file = cldr_dates_full_dir + locale + "/ca-gregorian.json" dateFields_file = cldr_dates_full_dir + locale + "/dateFields.json" with open(ca_gregorian_file) as f: - cldr_gregorian_data = json.load(f, object_pairs_hook=OrderedDict) + cldr_gregorian_data = json.load(f) with open(dateFields_file) as g: - cldr_datefields_data = json.load(g, object_pairs_hook=OrderedDict) + cldr_datefields_data = json.load(g) gregorian_dict = cldr_gregorian_data["main"][locale]["dates"]["calendars"][ "gregorian" ] date_fields_dict 
= cldr_datefields_data["main"][locale]["dates"]["fields"] - json_dict = OrderedDict() + json_dict = {} field_keys_1 = ["stand-alone", "format"] field_keys_2 = [ @@ -292,7 +291,7 @@ def _retrieve_locale_data(locale): json_dict["second"] = [date_fields_dict[key]["displayName"] for key in second_keys] - json_dict["relative-type"] = OrderedDict() + json_dict["relative-type"] = {} json_dict["relative-type"]["1 year ago"] = [ date_fields_dict[key]["relative-type--1"] for key in year_keys @@ -354,7 +353,7 @@ def _retrieve_locale_data(locale): date_fields_dict[key]["relative-type-0"] for key in second_keys ] - json_dict["relative-type-regex"] = OrderedDict() + json_dict["relative-type-regex"] = {} json_dict["relative-type-regex"]["in \\1 year"] = list( filter( @@ -524,11 +523,11 @@ def _clean_dict(json_dict): """Remove duplicates and sort""" for key, value in json_dict.items(): if isinstance(value, list): - json_dict[key] = sorted(OrderedDict.fromkeys(map(_clean_string, value))) + json_dict[key] = sorted(dict.fromkeys(map(_clean_string, value))) elif isinstance(value, dict): - json_dict[key] = OrderedDict(sorted(value.items())) + json_dict[key] = dict(sorted(value.items())) json_dict[key] = _clean_dict(json_dict[key]) - return OrderedDict(filter(lambda x: x[1], json_dict.items())) + return dict(filter(lambda x: x[1], json_dict.items())) def main(): @@ -544,14 +543,14 @@ def main(): for language in language_locale_dict: json_language_dict = _clean_dict(_retrieve_locale_data(language)) - locale_specific_dict = OrderedDict() + locale_specific_dict = {} locales_list = language_locale_dict[language] for locale in locales_list: json_locale_dict = _clean_dict(_retrieve_locale_data(locale)) locale_specific_dict[locale] = _clean_dict( get_dict_difference(json_language_dict, json_locale_dict) ) - json_language_dict["locale_specific"] = OrderedDict( + json_language_dict["locale_specific"] = dict( sorted(locale_specific_dict.items()) ) filename = directory + language + ".json" diff 
def _to_plain_types(obj):
    """Recursively convert dict/list subclasses to plain ``dict``/``list``.

    The supplementary YAML data is loaded with ruamel.yaml, whose
    ``CommentedMap``/``CommentedSeq`` containers are handled here as ``dict``
    and ``list`` subclasses. Rationale recorded with the original change:
    Python 3.14 changed the json C encoder to bypass the Python-level
    ``__iter__``/``items()`` of dict subclasses and access the underlying C
    dict directly, which produced a different (non-deterministic) key order
    for ``CommentedMap``. Converting to plain built-in containers before
    serialisation avoids the issue entirely.

    Args:
        obj: Any value. Mappings (``dict`` instances, including subclasses)
            and sequences (``list`` instances, including subclasses) are
            converted recursively; every other value is returned unchanged.

    Returns:
        A structure equal to ``obj`` in which every mapping is an exact
        ``dict`` and every sequence is an exact ``list``. Keys keep the order
        in which ``obj.items()`` yields them.
    """
    if isinstance(obj, dict):
        # Build a built-in dict rather than an OrderedDict: this module no
        # longer imports OrderedDict (calling it here raised NameError), and
        # built-in dicts already preserve insertion order.
        return {key: _to_plain_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [_to_plain_types(item) for item in obj]
    return obj
249919ae4..7c8759b89 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -3,7 +3,6 @@ import datetime as real_datetime import os import unittest -from collections import OrderedDict from datetime import datetime, timedelta from datetime import timezone as dttz from itertools import product @@ -615,18 +614,15 @@ def test_temporal_nouns_are_parsed(self, date_string, days_ago): self.then_date_is_n_days_ago(days=days_ago) def test_should_not_assume_language_too_early(self): - dates_to_parse = OrderedDict( - [ - ("07/07/2014", datetime(2014, 7, 7).date()), # any language - ("07.jul.2014 | 12:52", datetime(2014, 7, 7).date()), # en, es, pt, nl - ("07.ago.2014 | 12:52", datetime(2014, 8, 7).date()), # es, it, pt - ( - "07.feb.2014 | 12:52", - datetime(2014, 2, 7).date(), - ), # en, de, es, it, nl, ro - ("07.ene.2014 | 12:52", datetime(2014, 1, 7).date()), - ] - ) # es + dates_to_parse = { + "07/07/2014": datetime(2014, 7, 7).date(), # any language + "07.jul.2014 | 12:52": datetime(2014, 7, 7).date(), # en, es, pt, nl + "07.ago.2014 | 12:52": datetime(2014, 8, 7).date(), # es, it, pt + "07.feb.2014 | 12:52": datetime( + 2014, 2, 7 + ).date(), # en, de, es, it, nl, ro + "07.ene.2014 | 12:52": datetime(2014, 1, 7).date(), # es + } self.given_parser( restrict_to_languages=["en", "de", "fr", "it", "pt", "nl", "ro", "es", "ru"] diff --git a/tests/test_freshness_date_parser.py b/tests/test_freshness_date_parser.py index ad73de63c..34dc6a634 100644 --- a/tests/test_freshness_date_parser.py +++ b/tests/test_freshness_date_parser.py @@ -2374,15 +2374,19 @@ def test_dates_not_supported_by_date_time(self, date_string): @parameterized.expand( [ - param("1mon ago"), # 1116 + param("1mon ago", ago={"months": 1}, period="month"), # 1123 + param("2mon ago", ago={"months": 2}, period="month"), # 1123 + param("3mons ago", ago={"months": 3}, period="month"), # 1123 ] ) - def test_known_issues(self, date_string): + def test_known_issues(self, date_string, ago, period): 
self.given_parser() self.given_date_string(date_string) self.when_date_is_parsed() self.then_error_was_not_raised() - self.assertEqual(None, self.result["date_obj"]) + self.then_date_was_parsed_by_freshness_parser() + self.then_date_obj_is_exactly_this_time_ago(ago) + self.then_period_is(period) @parameterized.expand( [