diff --git a/dateparser/languages/locale.py b/dateparser/languages/locale.py index 33c14ea96..bf33f4375 100644 --- a/dateparser/languages/locale.py +++ b/dateparser/languages/locale.py @@ -152,8 +152,53 @@ def translate(self, date_string, keep_formatting=False, settings=None): date_string_tokens[i] = dictionary[word] or fallback if "in" in date_string_tokens: date_string_tokens = self._clear_future_words(date_string_tokens) + + # Remove empty tokens (skip words) and handle adjacent whitespace + # When a skip token is removed between spaces, keep the maximum number of spaces + filtered_tokens = [] + i = 0 + while i < len(date_string_tokens): + token = date_string_tokens[i] + + # Skip empty tokens (removed skip words) + if not token: + # Count preceding spaces already in filtered_tokens + prev_spaces = 0 + j = len(filtered_tokens) - 1 + while j >= 0 and filtered_tokens[j] == " ": + prev_spaces += 1 + j -= 1 + + # Count following spaces in the remaining tokens + next_spaces = 0 + j = i + 1 + while j < len(date_string_tokens) and date_string_tokens[j] == " ": + next_spaces += 1 + j += 1 + + # If surrounded by spaces, keep max(prev_spaces, next_spaces) + if prev_spaces > 0 and next_spaces > 0: + # Remove prev_spaces from filtered_tokens + for _ in range(prev_spaces): + filtered_tokens.pop() + + # Add back the maximum number of spaces + max_spaces = max(prev_spaces, next_spaces) + for _ in range(max_spaces): + filtered_tokens.append(" ") + + # Skip the empty token and all following spaces + i += next_spaces + 1 + continue + + i += 1 + continue + + filtered_tokens.append(token) + i += 1 + return self._join( - list(filter(bool, date_string_tokens)), + filtered_tokens, separator="" if keep_formatting else " ", settings=settings, ) @@ -511,7 +556,7 @@ def _generate_simplifications(self, normalize=False): def _clear_future_words(self, words): freshness_words = {"day", "week", "month", "year", "hour", "minute", "second"} if set(words).isdisjoint(freshness_words): - words.remove("in") + words[words.index("in")] = "" return words def _join(self, tokens, separator=" ", settings=None): diff --git a/tests/test_languages.py b/tests/test_languages.py index 070b9babe..52b9f2179 100644 --- a/tests/test_languages.py +++ b/tests/test_languages.py @@ -80,7 +80,7 @@ def setUp(self): # French param("fr", "20 Février 2012", "20 february 2012"), param("fr", "Mercredi 19 Novembre 2013", "wednesday 19 november 2013"), - param("fr", "18 octobre 2012 à 19 h 21 min", "18 october 2012 19:21"), + param("fr", "18 octobre 2012 à 19 h 21 min", "18 october 2012 19:21"), # German param("de", "29. Juni 2007", "29. june 2007"), param("de", "Montag 5 Januar, 2015", "monday 5 january 2015"), @@ -109,49 +109,57 @@ def setUp(self): param("it", "Giovedi Maggio 29 2013", "thursday may 29 2013"), param("it", "19 Luglio 2013", "19 july 2013"), # Portuguese - param("pt", "22 de dezembro de 2014 às 02:38", "22 december 2014 02:38"), + param("pt", "22 de dezembro de 2014 às 02:38", "22 december 2014 02:38"), # Russian - param("ru", "5 августа 2014 г. в 12:00", "5 august 2014 year. 12:00"), + param("ru", "5 августа 2014 г. в 12:00", "5 august 2014 year. 12:00"), # Turkish param("tr", "2 Ocak 2015 Cuma, 16:49", "2 january 2015 friday 16:49"), # Czech - param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"), + param( + "cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38" + ), # Issue #1302: "v" → "in" → cleared, whitespace preserved # Dutch param( "nl", "maandag 22 december 2014 om 2:38", - "monday 22 december 2014 2:38", + "monday 22 december 2014 2:38", ), # Romanian - param("ro", "22 Decembrie 2014 la 02:38", "22 december 2014 02:38"), + param("ro", "22 Decembrie 2014 la 02:38", "22 december 2014 02:38"), # Polish - param("pl", "4 stycznia o 13:50", "4 january 13:50"), - param("pl", "29 listopada 2014 o 08:40", "29 november 2014 08:40"), + param("pl", "4 stycznia o 13:50", "4 january 13:50"), + param( + "pl", "29 listopada 2014 o 08:40", "29 november 2014 08:40" + ), # Issue #1302: "o" removed, whitespace preserved # Ukrainian - param("uk", "30 листопада 2013 о 04:27", "30 november 2013 04:27"), - param("uk", "22 верес 2021 о 07:37", "22 september 2021 07:37"), - param("uk", "28 лютого 2020 року об 11:57", "28 february 2020 year 11:57"), + param("uk", "30 листопада 2013 о 04:27", "30 november 2013 04:27"), + param("uk", "22 верес 2021 о 07:37", "22 september 2021 07:37"), + param("uk", "28 лютого 2020 року об 11:57", "28 february 2020 year 11:57"), param( "uk", "середу, 28 лютого 2020 року об 11:57", - "wednesday 28 february 2020 year 11:57", + "wednesday 28 february 2020 year 11:57", ), param( "uk", "понед, 12 вересня 2022 року об 09:22", - "monday 12 september 2022 year 09:22", + "monday 12 september 2022 year 09:22", ), # Belarusian - param("be", "5 снежня 2015 г. у 12:00", "5 december 2015 year. 12:00"), - param("be", "11 верасня 2015 г. у 12:11", "11 september 2015 year. 12:11"), - param("be", "3 стд 2015 г. у 10:33", "3 january 2015 year. 10:33"), + param("be", "5 снежня 2015 г. у 12:00", "5 december 2015 year. 12:00"), + param("be", "11 верасня 2015 г. у 12:11", "11 september 2015 year. 12:11"), + param("be", "3 стд 2015 г. у 10:33", "3 january 2015 year. 10:33"), # Arabic param("ar", "6 يناير، 2015، الساعة 05:16 مساءً", "6 january 2015 05:16 pm"), param("ar", "7 يناير، 2015، الساعة 11:00 صباحاً", "7 january 2015 11:00 am"), # Vietnamese param("vi", "Thứ Năm, ngày 8 tháng 1 năm 2015", "thursday 8 january 2015"), - param("vi", "Thứ Tư, 07/01/2015 | 22:34", "wednesday 07/01/2015 22:34"), - param("vi", "9 Tháng 1 2015 lúc 15:08", "9 january 2015 15:08"), + param( + "vi", "Thứ Tư, 07/01/2015 | 22:34", "wednesday 07/01/2015 22:34" + ), # Pipe between spaces preserved + param( + "vi", "9 Tháng 1 2015 lúc 15:08", "9 january 2015 15:08" + ), # Issue #1302: "lúc" removed, whitespace preserved # Thai param( "th", @@ -184,11 +192,13 @@ def setUp(self): param("en", "2014-12-12T12:33:39-08:00", "2014-12-12 12:33:39-08:00"), param("en", "2014-10-15T16:12:20+00:00", "2014-10-15 16:12:20+00:00"), param("en", "28 Oct 2014 16:39:01 +0000", "28 october 2014 16:39:01 +0000"), - param("es", "13 Febrero 2015 a las 23:00", "13 february 2015 23:00"), + param("es", "13 Febrero 2015 a las 23:00", "13 february 2015 23:00"), # Danish param("da", "Sep 03 2014", "september 03 2014"), param("da", "fredag, 03 september 2014", "friday 03 september 2014"), - param("da", "fredag d. 3 september 2014", "friday 3 september 2014"), + param( + "da", "fredag d. 3 september 2014", "friday 3 september 2014" + ), # Issue #1302: 'd.' removed, whitespace preserved # Finnish param("fi", "maanantai tammikuu 16, 2015", "monday january 16 2015"), param("fi", "ma tammi 16, 2015", "monday january 16 2015"), @@ -216,7 +226,9 @@ def setUp(self): param("fi", "su joulu 16, 2015", "sunday december 16 2015"), param("fi", "1. tammikuuta, 2016", "1. january 2016"), param("fi", "tiistaina, 27. lokakuuta 2015", "tuesday 27. october 2015"), - param("fi", "28 maalis klo 9:37", "28 march 9:37"), + param( + "fi", "28 maalis klo 9:37", "28 march 9:37" + ), # Issue #1302: preserve single space # Japanese param("ja", "午後3時", "pm 3:00"), param("ja", "2時", "2:00"), @@ -239,7 +251,7 @@ def setUp(self): # Hebrew param("he", "20 לאפריל 2012", "20 april 2012"), param("he", "יום רביעי ה-19 בנובמבר 2013", "wednesday 19 november 2013"), - param("he", "18 לאוקטובר 2012 בשעה 19:21", "18 october 2012 19:21"), + param("he", "18 לאוקטובר 2012 בשעה 19:21", "18 october 2012 19:21"), param("he", "יום ה' 6/10/2016", "thursday 6/10/2016"), param("he", "חצות", "12 am"), param("he", "1 אחר חצות", "1 am"), @@ -497,7 +509,7 @@ def setUp(self): param("hr", "2 ožujak 1980 pet", "2 march 1980 friday"), param("hr", "nedjelja 3 lis 1879", "sunday 3 october 1879"), param("hr", "06. travnja 2021.", "06. april 2021."), - param("hr", "13. svibanj 2022. u 14:34", "13. may 2022. 14:34"), + param("hr", "13. svibanj 2022. u 14:34", "13. may 2022. 14:34"), param("hr", "20. studenoga 2010. @ 07:28", "20. november 2010. 07:28"), param("hr", "13. studenog 1989.", "13. november 1989."), param("hr", "u listopadu 2056.", " october 2056."), @@ -1210,7 +1222,9 @@ def test_translation(self, shortname, datetime_string, expected_translation): param("ar", "اليوم", "0 day ago"), # Polish param("pl", "2 godz.", "2 hour."), - param("pl", "Wczoraj o 07:40", "1 day ago 07:40"), + param( + "pl", "Wczoraj o 07:40", "1 day ago 07:40" + ), # Issue #1302: fixed double space # Vietnamese param("vi", "2 tuần 3 ngày", "2 week 3 day"), param("vi", "21 giờ trước", "21 hour ago"), @@ -1249,10 +1263,10 @@ def test_translation(self, shortname, datetime_string, expected_translation): param("id", "hari ini", "0 day ago"), param("id", "kemarin", "1 day ago"), param("id", "kemarin lusa", "2 day ago"), - param("id", "sehari yang lalu", "1 day ago"), - param("id", "seminggu yang lalu", "1 week ago"), - param("id", "sebulan yang lalu", "1 month ago"), - param("id", "setahun yang lalu", "1 year ago"), + param("id", "sehari yang lalu", "1 day ago"), + param("id", "seminggu yang lalu", "1 week ago"), + param("id", "sebulan yang lalu", "1 month ago"), + param("id", "setahun yang lalu", "1 year ago"), # Finnish param("fi", "1 vuosi sitten", "1 year ago"), param("fi", "2 vuotta sitten", "2 year ago"), @@ -1320,7 +1334,9 @@ def test_translation(self, shortname, datetime_string, expected_translation): param("ja", "明後日", "in 2 day"), # Hebrew param("he", "אתמול", "1 day ago"), - param("he", "אתמול בשעה 3", "1 day ago 3"), + param( + "he", "אתמול בשעה 3", "1 day ago 3" + ), # Issue #1302: "בשעה" removed, whitespace preserved param("he", "היום", "0 day ago"), param("he", "לפני יומיים", "2 day ago"), param("he", "לפני שבועיים", "2 week ago"), diff --git a/tests/test_whitespace_preservation.py b/tests/test_whitespace_preservation.py new file mode 100644 index 000000000..8c90a42a3 --- /dev/null +++ b/tests/test_whitespace_preservation.py @@ -0,0 +1,103 @@ +""" +Tests for issue #1302: Whitespace preservation during translation +""" + +from parameterized import param, parameterized + +from dateparser.conf import settings +from dateparser.languages import default_loader +from tests import BaseTestCase + + +class TestWhitespacePreservation(BaseTestCase): + """ + Tests to ensure that whitespace is preserved exactly when translating + date strings, even when tokens are removed from the skip list (e.g., "klo" in Finnish). + + Issue #1302: Extra whitespace handling during date translation + """ + + def setUp(self): + super().setUp() + self.language = NotImplemented + self.datetime_string = NotImplemented + self.translation = NotImplemented + self.settings = settings + + @parameterized.expand( + [ + # Finnish: "klo" is a skip word + param("fi", "28 maalis klo 9:37", "28 march 9:37"), + param("fi", "28 maalis klo 9:37", "28 march 9:37"), + param("fi", "28 maalis klo 9:37", "28 march 9:37"), + param("fi", "28 maalis klo 9:37", "28 march 9:37"), + param( + "fi", "tiistaina 27. lokakuuta 2015", "tuesday 27. october 2015" + ), + # Czech: "v" translates to "in", then cleared by _clear_future_words + param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"), + param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"), + # Polish: "o" is a skip word + param("pl", "4 stycznia o 13:50", "4 january 13:50"), + param("pl", "29 listopada 2014 o 08:40", "29 november 2014 08:40"), + # Russian: "в" is a skip word + param("ru", "5 августа 2014 г. в 12:00", "5 august 2014 year. 12:00"), + # Ukrainian: "о"/"об" are skip words + param("uk", "30 листопада 2013 о 04:27", "30 november 2013 04:27"), + # Croatian: "u" translates to "in", then cleared + param("hr", "13. svibanj 2022. u 14:34", "13. may 2022. 14:34"), + ] + ) + def test_whitespace_preservation_during_translation( + self, shortname, datetime_string, expected_translation + ): + """Test that exact whitespace is preserved when translating date strings.""" + self.given_bundled_language(shortname) + self.given_string(datetime_string) + self.when_datetime_string_translated() + self.then_string_translated_to(expected_translation) + + @parameterized.expand( + [ + # Finnish: keep_formatting=True + param("fi", "28 maalis klo 9:37", "28 march 9:37"), + param("fi", "28 maalis klo 9:37", "28 march 9:37"), + param("fi", "28 maalis klo 9:37", "28 march 9:37"), + # Czech: keep_formatting=True + param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"), + # Polish: keep_formatting=True + param("pl", "4 stycznia o 13:50", "4 january 13:50"), + ] + ) + def test_whitespace_preservation_keep_formatting( + self, shortname, datetime_string, expected_translation + ): + """Test whitespace preservation with keep_formatting=True.""" + self.given_bundled_language(shortname) + self.given_string(datetime_string) + self.when_datetime_string_translated_keep_formatting() + self.then_string_translated_to(expected_translation) + + def given_bundled_language(self, shortname): + self.language = default_loader.get_locale(shortname) + + def given_string(self, datetime_string): + self.datetime_string = datetime_string + + def when_datetime_string_translated(self): + self.translation = self.language.translate( + self.datetime_string, settings=self.settings + ) + + def when_datetime_string_translated_keep_formatting(self): + self.translation = self.language.translate( + self.datetime_string, keep_formatting=True, settings=self.settings + ) + + def then_string_translated_to(self, expected_string): + self.assertEqual( + expected_string, + self.translation, + f"\nExpected: |{expected_string}|\nGot: |{self.translation}|\n" + f"Input: |{self.datetime_string}|", + )