gh-137627: Make csv.Sniffer.sniff() delimiter detection 1.6x faster (#137628)

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
Maurycy Pawłowski-Wieroński 2025-10-23 14:28:29 +02:00 committed by GitHub
parent aa9d0a61d5
commit 6be6f8ff59
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 69 additions and 16 deletions

View file

@ -652,11 +652,11 @@ zlib
Optimizations
=============

csv
---

* :meth:`csv.Sniffer.sniff` delimiter detection is now up to 1.6x faster.
  (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)

Removed

View file

@ -362,31 +362,33 @@ def _guess_delimiter(self, data, delimiters):
try and evaluate the smallest portion of the data possible, evaluating try and evaluate the smallest portion of the data possible, evaluating
additional chunks as necessary. additional chunks as necessary.
""" """
from collections import Counter, defaultdict
data = list(filter(None, data.split('\n'))) data = list(filter(None, data.split('\n')))
ascii = [chr(c) for c in range(127)] # 7-bit ASCII
# build frequency tables # build frequency tables
chunkLength = min(10, len(data)) chunkLength = min(10, len(data))
iteration = 0 iteration = 0
charFrequency = {} num_lines = 0
# {char -> {count_per_line -> num_lines_with_that_count}}
char_frequency = defaultdict(Counter)
modes = {} modes = {}
delims = {} delims = {}
start, end = 0, chunkLength start, end = 0, chunkLength
while start < len(data): while start < len(data):
iteration += 1 iteration += 1
for line in data[start:end]: for line in data[start:end]:
for char in ascii: num_lines += 1
metaFrequency = charFrequency.get(char, {}) for char, count in Counter(line).items():
# must count even if frequency is 0 if char.isascii():
freq = line.count(char) char_frequency[char][count] += 1
# value is the mode
metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
charFrequency[char] = metaFrequency
for char in charFrequency.keys(): for char, counts in char_frequency.items():
items = list(charFrequency[char].items()) items = list(counts.items())
missed_lines = num_lines - sum(counts.values())
if missed_lines:
# Store the number of lines 'char' was missing from.
items.append((0, missed_lines))
if len(items) == 1 and items[0][0] == 0: if len(items) == 1 and items[0][0] == 0:
continue continue
# get the mode of the frequencies # get the mode of the frequencies

View file

@ -1437,6 +1437,56 @@ def test_doublequote(self):
dialect = sniffer.sniff(self.sample9) dialect = sniffer.sniff(self.sample9)
self.assertTrue(dialect.doublequote) self.assertTrue(dialect.doublequote)
def test_guess_delimiter_crlf_not_chosen(self):
# Ensure that we pick the real delimiter ("|") over "\r" in a tie.
sniffer = csv.Sniffer()
sample = "a|b\r\nc|d\r\ne|f\r\n"
self.assertEqual(sniffer.sniff(sample).delimiter, "|")
self.assertNotEqual(sniffer.sniff(sample).delimiter, "\r")
def test_zero_mode_tie_order_independence(self):
sniffer = csv.Sniffer()
# ":" appears in half the rows (1, 0, 1, 0) - a tie between
# 0 and 1 per line.
# "," appears once every row (true delimiter).
#
# Even if the zero-frequency bucket is appended vs. inserted, the tie
# yields an adjusted score of 0, so ":" should not be promoted and
# "," must be selected.
sample = (
"a,b:c\n"
"d,e\n"
"f,g:c\n"
"h,i\n"
)
dialect = sniffer.sniff(sample)
self.assertEqual(dialect.delimiter, ",")
def test_zero_mode_tie_order_comma_first(self):
sniffer = csv.Sniffer()
pattern = (
"a,b\n"
"c:d\n"
"e,f\n"
"g:h\n"
)
sample = pattern * 10
with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
sniffer.sniff(sample)
def test_zero_mode_tie_order_colon_first(self):
sniffer = csv.Sniffer()
pattern = (
"a:b\n"
"c,d\n"
"e:f\n"
"g,h\n"
)
sample = pattern * 10
with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
sniffer.sniff(sample)
class NUL: class NUL:
def write(s, *args):
    """Discard everything written to this null sink."""
    pass

View file

@ -0,0 +1 @@
Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.6x.