[3.12] gh-53203: Fix strptime() for %c and %x formats on many locales (GH-124946) (GH-125370)

In some locales (like French or Hebrew) the full or abbreviated names of
the default month and weekday used in __calc_date_time can be part of
another name or of a constant part of the %c format. The month name can
also match %m with a constant suffix (like in Japanese). So the code failed
to correctly distinguish the formats %a, %A, %b, %B and %m.

Cycle through all months and all days of the week to find the variable part
and distinguish %a from %A and %b from %B or %m.

Fixed locales for the following languages:
Arabic, Bislama, Breton, Bodo, Kashubian, Chuvash, Estonian, French, Irish,
Ge'ez, Gujarati, Manx Gaelic, Hebrew, Hindi, Chhattisgarhi, Haitian Kreyol,
Japanese, Kannada, Korean, Marathi, Malay, Norwegian, Nynorsk, Punjabi,
Rajasthani, Tok Pisin, Yoruba, Yue Chinese, Yau/Nungon and Chinese.

(cherry picked from commit c05f9dde8a)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Eli Bendersky <eliben@gmail.com>
This commit is contained in:
Miss Islington (bot) 2024-10-12 20:02:52 +02:00 committed by GitHub
parent e01a1784db
commit 331fc017ce
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 134 additions and 34 deletions

View file

@ -27,6 +27,18 @@ def _getlang():
# Figure out what the current language is set to.
return locale.getlocale(locale.LC_TIME)
def _findall(haystack, needle):
# Find all positions of needle in haystack.
if not needle:
return
i = 0
while True:
i = haystack.find(needle, i)
if i < 0:
break
yield i
i += len(needle)
class LocaleTime(object):
"""Stores and handles locale-specific information related to time.
@ -101,7 +113,8 @@ def __calc_am_pm(self):
am_pm = []
for hour in (1, 22):
time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0))
am_pm.append(time.strftime("%p", time_tuple).lower())
# br_FR has AM/PM info (' ',' ').
am_pm.append(time.strftime("%p", time_tuple).lower().strip())
self.am_pm = am_pm
def __calc_date_time(self):
@ -113,42 +126,114 @@ def __calc_date_time(self):
# values within the format string is very important; it eliminates
# possible ambiguity for what something represents.
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
date_time = [None, None, None]
date_time[0] = time.strftime("%c", time_tuple).lower()
date_time[1] = time.strftime("%x", time_tuple).lower()
date_time[2] = time.strftime("%X", time_tuple).lower()
replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'),
(self.f_month[3], '%B'), (self.a_weekday[2], '%a'),
(self.a_month[3], '%b'), (self.am_pm[1], '%p'),
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
replacement_pairs = [
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
('44', '%M'), ('55', '%S'), ('76', '%j'),
('17', '%d'), ('03', '%m'), ('3', '%m'),
# '3' needed for when no leading zero.
('2', '%w'), ('10', '%I')]
replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone
for tz in tz_values])
for offset,directive in ((0,'%c'), (1,'%x'), (2,'%X')):
current_format = date_time[offset]
for old, new in replacement_pairs:
date_time = []
for directive in ('%c', '%x', '%X'):
current_format = time.strftime(directive, time_tuple).lower()
current_format = current_format.replace('%', '%%')
# The month and the day of the week formats are treated specially
# because of a possible ambiguity in some locales where the full
# and abbreviated names are equal or names of different types
# are equal. See doc of __find_month_format for more details.
lst, fmt = self.__find_weekday_format(directive)
if lst:
current_format = current_format.replace(lst[2], fmt, 1)
lst, fmt = self.__find_month_format(directive)
if lst:
current_format = current_format.replace(lst[3], fmt, 1)
if self.am_pm[1]:
# Must deal with possible lack of locale info
# manifesting itself as the empty string (e.g., Swedish's
# lack of AM/PM info) or a platform returning a tuple of empty
# strings (e.g., MacOS 9 having timezone as ('','')).
if old:
current_format = current_format.replace(old, new)
current_format = current_format.replace(self.am_pm[1], '%p')
for tz_values in self.timezone:
for tz in tz_values:
if tz:
current_format = current_format.replace(tz, "%Z")
for old, new in replacement_pairs:
current_format = current_format.replace(old, new)
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
# 2005-01-03 occurs before the first Monday of the year. Otherwise
# %U is used.
time_tuple = time.struct_time((1999,1,3,1,1,1,6,3,0))
if '00' in time.strftime(directive, time_tuple):
if '00' in time.strftime(directive, time_tuple2):
U_W = '%W'
else:
U_W = '%U'
date_time[offset] = current_format.replace('11', U_W)
current_format = current_format.replace('11', U_W)
date_time.append(current_format)
self.LC_date_time = date_time[0]
self.LC_date = date_time[1]
self.LC_time = date_time[2]
def __find_month_format(self, directive):
    """Work out which month directive, if any, *directive* expands to.

    The name of a single sample month is not a reliable probe.  In some
    locales (for example French and Hebrew) the month used in
    __calc_date_time has the same name in full and abbreviated form, and
    a month name can accidentally match another part of the
    representation: the day of the week name (for example in Morisyen)
    or the month number (for example in Japanese).  Therefore every month
    of the year is formatted in turn, and only the positions at which the
    month name is found for all twelve months are kept.

    Return (self.f_month, '%B') if the full name has such a common
    position, otherwise (self.a_month, '%b') if the abbreviated name has
    one, otherwise (None, None), meaning the representation does not use
    the month name.
    """
    full_hits = None
    abbr_hits = None
    for month in range(1, 13):
        # Only the month field differs from one probe to the next.
        probe = time.struct_time((1999, month, 17, 22, 44, 55, 2, 76, 0))
        rendered = time.strftime(directive, probe).lower()

        found = set(_findall(rendered, self.f_month[month]))
        if full_hits is not None:
            full_hits &= found
        else:
            full_hits = found

        found = set(_findall(rendered, self.a_month[month]))
        if abbr_hits is not None:
            abbr_hits &= found
        else:
            abbr_hits = found

        # No position survives for either form: stop probing early.
        if not (full_hits or abbr_hits):
            return None, None

    # The full name takes precedence over the abbreviated one.
    if full_hits:
        return self.f_month, '%B'
    if abbr_hits:
        return self.a_month, '%b'
    return None, None
def __find_weekday_format(self, directive):
"""Find the day of the week format appropriate for the current locale.
Similar to __find_month_format(): format the same date once for each of
the seven weekday values and keep only the positions at which the weekday
name is found every time.
Return (self.f_weekday, '%A') if the full name has such a common position,
otherwise (self.a_weekday, '%a') if the abbreviated name has one,
otherwise (None, None).
"""
# Both sets start as None and are seeded by the first weekday probed.
full_indices = abbr_indices = None
for wd in range(7):
# Only the weekday field (index 6) varies; every other field is fixed.
time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, wd, 76, 0))
# The output is lower-cased before searching; presumably the stored
# weekday names are lower-case as well -- verify where they are set.
datetime = time.strftime(directive, time_tuple).lower()
indices = set(_findall(datetime, self.f_weekday[wd]))
if full_indices is None:
full_indices = indices
else:
full_indices &= indices
# NOTE(review): block indentation is not visible in this view.  This guard
# presumably covers only the next line, so that when the full and
# abbreviated names coincide the positions found for the full name are
# reused for the abbreviated one -- confirm against the upstream file.
if self.f_weekday[wd] != self.a_weekday[wd]:
indices = set(_findall(datetime, self.a_weekday[wd]))
if abbr_indices is None:
abbr_indices = indices
else:
abbr_indices &= indices
# Neither form has a position common to all weekdays probed so far.
if not full_indices and not abbr_indices:
return None, None
# The full name takes precedence over the abbreviated one.
if full_indices:
return self.f_weekday, '%A'
if abbr_indices:
return self.a_weekday, '%a'
return None, None
def __calc_timezone(self):
# Set self.timezone by using time.tzname.
# Do not worry about possibility of time.tzname[0] == time.tzname[1]
@ -186,7 +271,7 @@ def __init__(self, locale_time=None):
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
'f': r"(?P<f>[0-9]{1,6})",
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
'G': r"(?P<G>\d\d\d\d)",
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
@ -330,8 +415,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
_regex_cache[format] = format_regex
found = format_regex.match(data_string)
if not found:
raise ValueError("time data %r does not match format %r" %
(data_string, format))
raise ValueError("time data %r does not match format %r :: /%s/" %
(data_string, format, format_regex.pattern))
if len(data_string) != found.end():
raise ValueError("unconverted data remains: %s" %
data_string[found.end():])

View file

@ -5,6 +5,7 @@
import locale
import re
import os
import platform
import sys
from test import support
from test.support import skip_if_buggy_ucrt_strfptime, run_with_locales
@ -12,6 +13,13 @@
import _strptime
libc_ver = platform.libc_ver()
if libc_ver[0] == 'glibc':
glibc_ver = tuple(map(int, libc_ver[1].split('.')))
else:
glibc_ver = None
class getlang_Tests(unittest.TestCase):
"""Test _getlang"""
def test_basic(self):
@ -476,16 +484,16 @@ def test_bad_timezone(self):
# * Year is not included: ha_NG.
# * Use non-Gregorian calendar: lo_LA, thai, th_TH.
#
# BUG: Generates invalid regexp for br_FR, csb_PL, Arabic.
# BUG: Generates regexp that does not match the current date and time
# for fa_IR, gez_ER, gez_ET, lzh_TW, my_MM, or_IN, shn_MM, yo_NG.
# BUG: Generates regexp that does not match the current date and time
# for fa_IR, gez_ER, gez_ET, lzh_TW, my_MM, or_IN, shn_MM, yo_NG,
# fr_FR, ja_JP, he_IL, ko_KR, zh_CN, etc.
@run_with_locales('LC_TIME', 'C', 'en_US', 'de_DE',
'eu_ES', 'mfe_MU')
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
def test_date_time_locale(self):
# Test %c directive
loc = locale.getlocale(locale.LC_TIME)[0]
if glibc_ver and glibc_ver < (2, 31) and loc == 'br_FR':
self.skipTest('%c in locale br_FR does not include time')
now = time.time()
self.roundtrip('%c', slice(0, 6), time.localtime(now))
# 1 hour 20 minutes 30 seconds ago
@ -503,7 +511,9 @@ def test_date_time_locale(self):
# NB: Dates before 1969 do not roundtrip on some locales:
# bo_CN, bo_IN, dz_BT, eu_ES, eu_FR.
@run_with_locales('LC_TIME', 'C', 'en_US', 'de_DE', 'ja_JP')
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
def test_date_time_locale2(self):
# Test %c directive
self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
@ -511,10 +521,9 @@ def test_date_time_locale2(self):
# NB: Does not roundtrip because use non-Gregorian calendar:
# lo_LA, thai, th_TH.
# BUG: Generates regexp that does not match the current date
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM,
# Arabic, ja_JP, ko_KR, zh_CN, etc.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE',
'he_IL', 'eu_ES')
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'eu_ES', 'ar_AE')
def test_date_locale(self):
# Test %x directive
now = time.time()
@ -533,7 +542,8 @@ def test_date_locale(self):
support.is_emscripten or support.is_wasi,
"musl libc issue on Emscripten, bpo-46390"
)
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP')
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'eu_ES', 'ar_AE')
def test_date_locale2(self):
# Test %x directive
self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))

View file

@ -0,0 +1,5 @@
Fix :func:`time.strptime` for ``%c`` and ``%x`` formats in many locales:
Arabic, Bislama, Breton, Bodo, Kashubian, Chuvash, Estonian, French, Irish,
Ge'ez, Gujarati, Manx Gaelic, Hebrew, Hindi, Chhattisgarhi, Haitian Kreyol,
Japanese, Kannada, Korean, Marathi, Malay, Norwegian, Nynorsk, Punjabi,
Rajasthani, Tok Pisin, Yoruba, Yue Chinese, Yau/Nungon and Chinese.