Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 47 additions & 2 deletions dateparser/languages/locale.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,53 @@ def translate(self, date_string, keep_formatting=False, settings=None):
date_string_tokens[i] = dictionary[word] or fallback
if "in" in date_string_tokens:
date_string_tokens = self._clear_future_words(date_string_tokens)

# Remove empty tokens (skip words) and handle adjacent whitespace
# When a skip token is removed between spaces, keep the maximum number of spaces
filtered_tokens = []
i = 0
while i < len(date_string_tokens):
token = date_string_tokens[i]

# Skip empty tokens (removed skip words)
if not token:
# Count preceding spaces already in filtered_tokens
prev_spaces = 0
j = len(filtered_tokens) - 1
while j >= 0 and filtered_tokens[j] == " ":
prev_spaces += 1
j -= 1

# Count following spaces in the remaining tokens
next_spaces = 0
j = i + 1
while j < len(date_string_tokens) and date_string_tokens[j] == " ":
next_spaces += 1
j += 1

# If surrounded by spaces, keep max(prev_spaces, next_spaces)
if prev_spaces > 0 and next_spaces > 0:
# Remove prev_spaces from filtered_tokens
for _ in range(prev_spaces):
filtered_tokens.pop()

# Add back the maximum number of spaces
max_spaces = max(prev_spaces, next_spaces)
for _ in range(max_spaces):
filtered_tokens.append(" ")

# Skip the empty token and all following spaces
i += next_spaces + 1
continue

i += 1
continue

filtered_tokens.append(token)
i += 1

return self._join(
list(filter(bool, date_string_tokens)),
filtered_tokens,
separator="" if keep_formatting else " ",
settings=settings,
)
Expand Down Expand Up @@ -511,7 +556,7 @@ def _generate_simplifications(self, normalize=False):
def _clear_future_words(self, words):
    """Blank out the first ``"in"`` token when no time-unit word is present.

    If none of the tokens is a unit such as ``"day"`` or ``"hour"``, the
    first ``"in"`` is not part of a relative expression like "in 2 day",
    so it is replaced by an empty string.

    :param words: list of translated tokens; modified in place. The
        visible caller only invokes this when ``"in"`` is in the list.
    :return: the same list object that was passed in.
    :raises ValueError: if no unit word is present and ``"in"`` is not in
        ``words``.
    """
    freshness_words = {"day", "week", "month", "year", "hour", "minute", "second"}
    if set(words).isdisjoint(freshness_words):
        # Overwrite instead of removing: the token keeps its position, so
        # the neighbouring whitespace tokens stay adjacent to an empty
        # placeholder that translate() later drops while reconciling spaces.
        words[words.index("in")] = ""
    return words

def _join(self, tokens, separator=" ", settings=None):
Expand Down
74 changes: 45 additions & 29 deletions tests/test_languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def setUp(self):
# French
param("fr", "20 Février 2012", "20 february 2012"),
param("fr", "Mercredi 19 Novembre 2013", "wednesday 19 november 2013"),
param("fr", "18 octobre 2012 à 19 h 21 min", "18 october 2012 19:21"),
param("fr", "18 octobre 2012 à 19 h 21 min", "18 october 2012 19:21"),
# German
param("de", "29. Juni 2007", "29. june 2007"),
param("de", "Montag 5 Januar, 2015", "monday 5 january 2015"),
Expand Down Expand Up @@ -109,49 +109,57 @@ def setUp(self):
param("it", "Giovedi Maggio 29 2013", "thursday may 29 2013"),
param("it", "19 Luglio 2013", "19 july 2013"),
# Portuguese
param("pt", "22 de dezembro de 2014 às 02:38", "22 december 2014 02:38"),
param("pt", "22 de dezembro de 2014 às 02:38", "22 december 2014 02:38"),
# Russian
param("ru", "5 августа 2014 г. в 12:00", "5 august 2014 year. 12:00"),
param("ru", "5 августа 2014 г. в 12:00", "5 august 2014 year. 12:00"),
# Turkish
param("tr", "2 Ocak 2015 Cuma, 16:49", "2 january 2015 friday 16:49"),
# Czech
param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"),
param(
"cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"
), # Issue #1302: "v" → "in" → cleared, whitespace preserved
# Dutch
param(
"nl",
"maandag 22 december 2014 om 2:38",
"monday 22 december 2014 2:38",
"monday 22 december 2014 2:38",
),
# Romanian
param("ro", "22 Decembrie 2014 la 02:38", "22 december 2014 02:38"),
param("ro", "22 Decembrie 2014 la 02:38", "22 december 2014 02:38"),
# Polish
param("pl", "4 stycznia o 13:50", "4 january 13:50"),
param("pl", "29 listopada 2014 o 08:40", "29 november 2014 08:40"),
param("pl", "4 stycznia o 13:50", "4 january 13:50"),
param(
"pl", "29 listopada 2014 o 08:40", "29 november 2014 08:40"
), # Issue #1302: "o" removed, whitespace preserved
# Ukrainian
param("uk", "30 листопада 2013 о 04:27", "30 november 2013 04:27"),
param("uk", "22 верес 2021 о 07:37", "22 september 2021 07:37"),
param("uk", "28 лютого 2020 року об 11:57", "28 february 2020 year 11:57"),
param("uk", "30 листопада 2013 о 04:27", "30 november 2013 04:27"),
param("uk", "22 верес 2021 о 07:37", "22 september 2021 07:37"),
param("uk", "28 лютого 2020 року об 11:57", "28 february 2020 year 11:57"),
param(
"uk",
"середу, 28 лютого 2020 року об 11:57",
"wednesday 28 february 2020 year 11:57",
"wednesday 28 february 2020 year 11:57",
),
param(
"uk",
"понед, 12 вересня 2022 року об 09:22",
"monday 12 september 2022 year 09:22",
"monday 12 september 2022 year 09:22",
),
# Belarusian
param("be", "5 снежня 2015 г. у 12:00", "5 december 2015 year. 12:00"),
param("be", "11 верасня 2015 г. у 12:11", "11 september 2015 year. 12:11"),
param("be", "3 стд 2015 г. у 10:33", "3 january 2015 year. 10:33"),
param("be", "5 снежня 2015 г. у 12:00", "5 december 2015 year. 12:00"),
param("be", "11 верасня 2015 г. у 12:11", "11 september 2015 year. 12:11"),
param("be", "3 стд 2015 г. у 10:33", "3 january 2015 year. 10:33"),
# Arabic
param("ar", "6 يناير، 2015، الساعة 05:16 مساءً", "6 january 2015 05:16 pm"),
param("ar", "7 يناير، 2015، الساعة 11:00 صباحاً", "7 january 2015 11:00 am"),
# Vietnamese
param("vi", "Thứ Năm, ngày 8 tháng 1 năm 2015", "thursday 8 january 2015"),
param("vi", "Thứ Tư, 07/01/2015 | 22:34", "wednesday 07/01/2015 22:34"),
param("vi", "9 Tháng 1 2015 lúc 15:08", "9 january 2015 15:08"),
param(
"vi", "Thứ Tư, 07/01/2015 | 22:34", "wednesday 07/01/2015 22:34"
), # Pipe between spaces preserved
param(
"vi", "9 Tháng 1 2015 lúc 15:08", "9 january 2015 15:08"
), # Issue #1302: "lúc" removed, whitespace preserved
# Thai
param(
"th",
Expand Down Expand Up @@ -184,11 +192,13 @@ def setUp(self):
param("en", "2014-12-12T12:33:39-08:00", "2014-12-12 12:33:39-08:00"),
param("en", "2014-10-15T16:12:20+00:00", "2014-10-15 16:12:20+00:00"),
param("en", "28 Oct 2014 16:39:01 +0000", "28 october 2014 16:39:01 +0000"),
param("es", "13 Febrero 2015 a las 23:00", "13 february 2015 23:00"),
param("es", "13 Febrero 2015 a las 23:00", "13 february 2015 23:00"),
# Danish
param("da", "Sep 03 2014", "september 03 2014"),
param("da", "fredag, 03 september 2014", "friday 03 september 2014"),
param("da", "fredag d. 3 september 2014", "friday 3 september 2014"),
param(
"da", "fredag d. 3 september 2014", "friday 3 september 2014"
), # Issue #1302: 'd.' removed, whitespace preserved
# Finnish
param("fi", "maanantai tammikuu 16, 2015", "monday january 16 2015"),
param("fi", "ma tammi 16, 2015", "monday january 16 2015"),
Expand Down Expand Up @@ -216,7 +226,9 @@ def setUp(self):
param("fi", "su joulu 16, 2015", "sunday december 16 2015"),
param("fi", "1. tammikuuta, 2016", "1. january 2016"),
param("fi", "tiistaina, 27. lokakuuta 2015", "tuesday 27. october 2015"),
param("fi", "28 maalis klo 9:37", "28 march 9:37"),
param(
"fi", "28 maalis klo 9:37", "28 march 9:37"
), # Issue #1302: preserve single space
# Japanese
param("ja", "午後3時", "pm 3:00"),
param("ja", "2時", "2:00"),
Expand All @@ -239,7 +251,7 @@ def setUp(self):
# Hebrew
param("he", "20 לאפריל 2012", "20 april 2012"),
param("he", "יום רביעי ה-19 בנובמבר 2013", "wednesday 19 november 2013"),
param("he", "18 לאוקטובר 2012 בשעה 19:21", "18 october 2012 19:21"),
param("he", "18 לאוקטובר 2012 בשעה 19:21", "18 october 2012 19:21"),
param("he", "יום ה' 6/10/2016", "thursday 6/10/2016"),
param("he", "חצות", "12 am"),
param("he", "1 אחר חצות", "1 am"),
Expand Down Expand Up @@ -497,7 +509,7 @@ def setUp(self):
param("hr", "2 ožujak 1980 pet", "2 march 1980 friday"),
param("hr", "nedjelja 3 lis 1879", "sunday 3 october 1879"),
param("hr", "06. travnja 2021.", "06. april 2021."),
param("hr", "13. svibanj 2022. u 14:34", "13. may 2022. 14:34"),
param("hr", "13. svibanj 2022. u 14:34", "13. may 2022. 14:34"),
param("hr", "20. studenoga 2010. @ 07:28", "20. november 2010. 07:28"),
param("hr", "13. studenog 1989.", "13. november 1989."),
param("hr", "u listopadu 2056.", " october 2056."),
Expand Down Expand Up @@ -1210,7 +1222,9 @@ def test_translation(self, shortname, datetime_string, expected_translation):
param("ar", "اليوم", "0 day ago"),
# Polish
param("pl", "2 godz.", "2 hour."),
param("pl", "Wczoraj o 07:40", "1 day ago 07:40"),
param(
"pl", "Wczoraj o 07:40", "1 day ago 07:40"
), # Issue #1302: fixed double space
# Vietnamese
param("vi", "2 tuần 3 ngày", "2 week 3 day"),
param("vi", "21 giờ trước", "21 hour ago"),
Expand Down Expand Up @@ -1249,10 +1263,10 @@ def test_translation(self, shortname, datetime_string, expected_translation):
param("id", "hari ini", "0 day ago"),
param("id", "kemarin", "1 day ago"),
param("id", "kemarin lusa", "2 day ago"),
param("id", "sehari yang lalu", "1 day ago"),
param("id", "seminggu yang lalu", "1 week ago"),
param("id", "sebulan yang lalu", "1 month ago"),
param("id", "setahun yang lalu", "1 year ago"),
param("id", "sehari yang lalu", "1 day ago"),
param("id", "seminggu yang lalu", "1 week ago"),
param("id", "sebulan yang lalu", "1 month ago"),
param("id", "setahun yang lalu", "1 year ago"),
# Finnish
param("fi", "1 vuosi sitten", "1 year ago"),
param("fi", "2 vuotta sitten", "2 year ago"),
Expand Down Expand Up @@ -1320,7 +1334,9 @@ def test_translation(self, shortname, datetime_string, expected_translation):
param("ja", "明後日", "in 2 day"),
# Hebrew
param("he", "אתמול", "1 day ago"),
param("he", "אתמול בשעה 3", "1 day ago 3"),
param(
"he", "אתמול בשעה 3", "1 day ago 3"
), # Issue #1302: "בשעה" removed, whitespace preserved
param("he", "היום", "0 day ago"),
param("he", "לפני יומיים", "2 day ago"),
param("he", "לפני שבועיים", "2 week ago"),
Expand Down
103 changes: 103 additions & 0 deletions tests/test_whitespace_preservation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
Tests for issue #1302: Whitespace preservation during translation
"""

from parameterized import param, parameterized

from dateparser.conf import settings
from dateparser.languages import default_loader
from tests import BaseTestCase


class TestWhitespacePreservation(BaseTestCase):
    """
    Translation tests for issue #1302 (whitespace handling in ``translate``).

    Each case translates a date string with a bundled locale and compares the
    result with an expected string. The inputs contain words that disappear
    during translation (e.g. "klo" in Finnish).
    """

    # Cases run with the default ``keep_formatting`` value.
    DEFAULT_CASES = [
        # Finnish: "klo" is a skip word
        param("fi", "28 maalis klo 9:37", "28 march 9:37"),
        param("fi", "28 maalis klo 9:37", "28 march 9:37"),
        param("fi", "28 maalis klo 9:37", "28 march 9:37"),
        param("fi", "28 maalis klo 9:37", "28 march 9:37"),
        param("fi", "tiistaina 27. lokakuuta 2015", "tuesday 27. october 2015"),
        # Czech: "v" translates to "in", then cleared by _clear_future_words
        param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"),
        param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"),
        # Polish: "o" is a skip word
        param("pl", "4 stycznia o 13:50", "4 january 13:50"),
        param("pl", "29 listopada 2014 o 08:40", "29 november 2014 08:40"),
        # Russian: "в" is a skip word
        param("ru", "5 августа 2014 г. в 12:00", "5 august 2014 year. 12:00"),
        # Ukrainian: "о"/"об" are skip words
        param("uk", "30 листопада 2013 о 04:27", "30 november 2013 04:27"),
        # Croatian: "u" translates to "in", then cleared
        param("hr", "13. svibanj 2022. u 14:34", "13. may 2022. 14:34"),
    ]

    # Cases run with ``keep_formatting=True``.
    KEEP_FORMATTING_CASES = [
        # Finnish
        param("fi", "28 maalis klo 9:37", "28 march 9:37"),
        param("fi", "28 maalis klo 9:37", "28 march 9:37"),
        param("fi", "28 maalis klo 9:37", "28 march 9:37"),
        # Czech
        param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"),
        # Polish
        param("pl", "4 stycznia o 13:50", "4 january 13:50"),
    ]

    def setUp(self):
        super().setUp()
        # Placeholders; every test fills these in through the given_*/when_*
        # steps before asserting.
        self.language = NotImplemented
        self.datetime_string = NotImplemented
        self.translation = NotImplemented
        self.settings = settings

    @parameterized.expand(DEFAULT_CASES)
    def test_whitespace_preservation_during_translation(
        self, locale_code, source_text, expected_text
    ):
        """Translate with default formatting and compare with the expectation."""
        self.given_bundled_language(locale_code)
        self.given_string(source_text)
        self.when_datetime_string_translated()
        self.then_string_translated_to(expected_text)

    @parameterized.expand(KEEP_FORMATTING_CASES)
    def test_whitespace_preservation_keep_formatting(
        self, locale_code, source_text, expected_text
    ):
        """Translate with keep_formatting=True and compare with the expectation."""
        self.given_bundled_language(locale_code)
        self.given_string(source_text)
        self.when_datetime_string_translated_keep_formatting()
        self.then_string_translated_to(expected_text)

    def given_bundled_language(self, shortname):
        """Load the locale identified by ``shortname`` from the default loader."""
        self.language = default_loader.get_locale(shortname)

    def given_string(self, datetime_string):
        """Remember the date string that will be translated."""
        self.datetime_string = datetime_string

    def when_datetime_string_translated(self):
        """Translate the stored string without passing ``keep_formatting``."""
        self.translation = self._run_translate()

    def when_datetime_string_translated_keep_formatting(self):
        """Translate the stored string with ``keep_formatting=True``."""
        self.translation = self._run_translate(keep_formatting=True)

    def then_string_translated_to(self, expected_string):
        """Assert the stored translation equals ``expected_string``."""
        # Pipes delimit each value so leading/trailing spaces are visible in
        # the failure output.
        failure_message = (
            f"\nExpected: |{expected_string}|\nGot: |{self.translation}|\n"
            f"Input: |{self.datetime_string}|"
        )
        self.assertEqual(expected_string, self.translation, failure_message)

    def _run_translate(self, **options):
        """Call ``translate`` on the stored string with the shared settings."""
        return self.language.translate(
            self.datetime_string, settings=self.settings, **options
        )
Loading