mirror of
https://github.com/python/cpython.git
synced 2025-12-08 06:10:17 +00:00
gh-137627: Make csv.Sniffer.sniff() delimiter detection 1.6x faster (#137628)
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
parent
aa9d0a61d5
commit
6be6f8ff59
4 changed files with 69 additions and 16 deletions
26
Lib/csv.py
26
Lib/csv.py
|
|
@@ -362,31 +362,33 @@ def _guess_delimiter(self, data, delimiters):
|
|||
try and evaluate the smallest portion of the data possible, evaluating
|
||||
additional chunks as necessary.
|
||||
"""
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
data = list(filter(None, data.split('\n')))
|
||||
|
||||
ascii = [chr(c) for c in range(127)] # 7-bit ASCII
|
||||
|
||||
# build frequency tables
|
||||
chunkLength = min(10, len(data))
|
||||
iteration = 0
|
||||
charFrequency = {}
|
||||
num_lines = 0
|
||||
# {char -> {count_per_line -> num_lines_with_that_count}}
|
||||
char_frequency = defaultdict(Counter)
|
||||
modes = {}
|
||||
delims = {}
|
||||
start, end = 0, chunkLength
|
||||
while start < len(data):
|
||||
iteration += 1
|
||||
for line in data[start:end]:
|
||||
for char in ascii:
|
||||
metaFrequency = charFrequency.get(char, {})
|
||||
# must count even if frequency is 0
|
||||
freq = line.count(char)
|
||||
# value is the mode
|
||||
metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
|
||||
charFrequency[char] = metaFrequency
|
||||
num_lines += 1
|
||||
for char, count in Counter(line).items():
|
||||
if char.isascii():
|
||||
char_frequency[char][count] += 1
|
||||
|
||||
for char in charFrequency.keys():
|
||||
items = list(charFrequency[char].items())
|
||||
for char, counts in char_frequency.items():
|
||||
items = list(counts.items())
|
||||
missed_lines = num_lines - sum(counts.values())
|
||||
if missed_lines:
|
||||
# Store the number of lines 'char' was missing from.
|
||||
items.append((0, missed_lines))
|
||||
if len(items) == 1 and items[0][0] == 0:
|
||||
continue
|
||||
# get the mode of the frequencies
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue