mirror of
https://github.com/python/cpython.git
synced 2025-12-08 06:10:17 +00:00
gh-137627: Make csv.Sniffer.sniff() delimiter detection 1.6x faster (#137628)
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
parent
aa9d0a61d5
commit
6be6f8ff59
4 changed files with 69 additions and 16 deletions
|
|
@ -652,11 +652,11 @@ zlib
|
||||||
Optimizations
|
Optimizations
|
||||||
=============
|
=============
|
||||||
|
|
||||||
module_name
|
csv
|
||||||
-----------
|
---
|
||||||
|
|
||||||
* TODO
|
|
||||||
|
|
||||||
|
* :meth:`csv.Sniffer.sniff` delimiter detection is now up to 1.6x faster.
|
||||||
|
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)
|
||||||
|
|
||||||
|
|
||||||
Removed
|
Removed
|
||||||
|
|
|
||||||
26
Lib/csv.py
26
Lib/csv.py
|
|
@ -362,31 +362,33 @@ def _guess_delimiter(self, data, delimiters):
|
||||||
try and evaluate the smallest portion of the data possible, evaluating
|
try and evaluate the smallest portion of the data possible, evaluating
|
||||||
additional chunks as necessary.
|
additional chunks as necessary.
|
||||||
"""
|
"""
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
|
||||||
data = list(filter(None, data.split('\n')))
|
data = list(filter(None, data.split('\n')))
|
||||||
|
|
||||||
ascii = [chr(c) for c in range(127)] # 7-bit ASCII
|
|
||||||
|
|
||||||
# build frequency tables
|
# build frequency tables
|
||||||
chunkLength = min(10, len(data))
|
chunkLength = min(10, len(data))
|
||||||
iteration = 0
|
iteration = 0
|
||||||
charFrequency = {}
|
num_lines = 0
|
||||||
|
# {char -> {count_per_line -> num_lines_with_that_count}}
|
||||||
|
char_frequency = defaultdict(Counter)
|
||||||
modes = {}
|
modes = {}
|
||||||
delims = {}
|
delims = {}
|
||||||
start, end = 0, chunkLength
|
start, end = 0, chunkLength
|
||||||
while start < len(data):
|
while start < len(data):
|
||||||
iteration += 1
|
iteration += 1
|
||||||
for line in data[start:end]:
|
for line in data[start:end]:
|
||||||
for char in ascii:
|
num_lines += 1
|
||||||
metaFrequency = charFrequency.get(char, {})
|
for char, count in Counter(line).items():
|
||||||
# must count even if frequency is 0
|
if char.isascii():
|
||||||
freq = line.count(char)
|
char_frequency[char][count] += 1
|
||||||
# value is the mode
|
|
||||||
metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
|
|
||||||
charFrequency[char] = metaFrequency
|
|
||||||
|
|
||||||
for char in charFrequency.keys():
|
for char, counts in char_frequency.items():
|
||||||
items = list(charFrequency[char].items())
|
items = list(counts.items())
|
||||||
|
missed_lines = num_lines - sum(counts.values())
|
||||||
|
if missed_lines:
|
||||||
|
# Store the number of lines 'char' was missing from.
|
||||||
|
items.append((0, missed_lines))
|
||||||
if len(items) == 1 and items[0][0] == 0:
|
if len(items) == 1 and items[0][0] == 0:
|
||||||
continue
|
continue
|
||||||
# get the mode of the frequencies
|
# get the mode of the frequencies
|
||||||
|
|
|
||||||
|
|
@ -1437,6 +1437,56 @@ def test_doublequote(self):
|
||||||
dialect = sniffer.sniff(self.sample9)
|
dialect = sniffer.sniff(self.sample9)
|
||||||
self.assertTrue(dialect.doublequote)
|
self.assertTrue(dialect.doublequote)
|
||||||
|
|
||||||
|
def test_guess_delimiter_crlf_not_chosen(self):
|
||||||
|
# Ensure that we pick the real delimiter ("|") over "\r" in a tie.
|
||||||
|
sniffer = csv.Sniffer()
|
||||||
|
sample = "a|b\r\nc|d\r\ne|f\r\n"
|
||||||
|
self.assertEqual(sniffer.sniff(sample).delimiter, "|")
|
||||||
|
self.assertNotEqual(sniffer.sniff(sample).delimiter, "\r")
|
||||||
|
|
||||||
|
def test_zero_mode_tie_order_independence(self):
|
||||||
|
sniffer = csv.Sniffer()
|
||||||
|
# ":" appears in half the rows (1, 0, 1, 0) - a tie between
|
||||||
|
# 0 and 1 per line.
|
||||||
|
# "," appears once every row (true delimiter).
|
||||||
|
#
|
||||||
|
# Whether the zero-frequency bucket is appended or inserted, the tie
|
||||||
|
# yields an adjusted score of 0, so ":" should not be promoted and
|
||||||
|
# "," must be selected.
|
||||||
|
sample = (
|
||||||
|
"a,b:c\n"
|
||||||
|
"d,e\n"
|
||||||
|
"f,g:c\n"
|
||||||
|
"h,i\n"
|
||||||
|
)
|
||||||
|
dialect = sniffer.sniff(sample)
|
||||||
|
self.assertEqual(dialect.delimiter, ",")
|
||||||
|
|
||||||
|
def test_zero_mode_tie_order_comma_first(self):
|
||||||
|
sniffer = csv.Sniffer()
|
||||||
|
pattern = (
|
||||||
|
"a,b\n"
|
||||||
|
"c:d\n"
|
||||||
|
"e,f\n"
|
||||||
|
"g:h\n"
|
||||||
|
)
|
||||||
|
sample = pattern * 10
|
||||||
|
with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
|
||||||
|
sniffer.sniff(sample)
|
||||||
|
|
||||||
|
def test_zero_mode_tie_order_colon_first(self):
|
||||||
|
sniffer = csv.Sniffer()
|
||||||
|
pattern = (
|
||||||
|
"a:b\n"
|
||||||
|
"c,d\n"
|
||||||
|
"e:f\n"
|
||||||
|
"g,h\n"
|
||||||
|
)
|
||||||
|
sample = pattern * 10
|
||||||
|
with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
|
||||||
|
sniffer.sniff(sample)
|
||||||
|
|
||||||
|
|
||||||
class NUL:
|
class NUL:
|
||||||
def write(s, *args):
|
def write(s, *args):
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.6x.
|
||||||
Loading…
Add table
Add a link
Reference in a new issue