gh-137627: Make csv.Sniffer.sniff() delimiter detection 1.6x faster (#137628)

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
Maurycy Pawłowski-Wieroński 2025-10-23 14:28:29 +02:00 committed by GitHub
parent aa9d0a61d5
commit 6be6f8ff59
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 69 additions and 16 deletions

View file

@ -652,11 +652,11 @@ zlib
Optimizations
=============

csv
---

* :meth:`csv.Sniffer.sniff` delimiter detection is now up to 1.6x faster.
  (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)

Removed

View file

@ -362,31 +362,33 @@ def _guess_delimiter(self, data, delimiters):
try and evaluate the smallest portion of the data possible, evaluating try and evaluate the smallest portion of the data possible, evaluating
additional chunks as necessary. additional chunks as necessary.
""" """
from collections import Counter, defaultdict
data = list(filter(None, data.split('\n'))) data = list(filter(None, data.split('\n')))
ascii = [chr(c) for c in range(127)] # 7-bit ASCII
# build frequency tables # build frequency tables
chunkLength = min(10, len(data)) chunkLength = min(10, len(data))
iteration = 0 iteration = 0
charFrequency = {} num_lines = 0
# {char -> {count_per_line -> num_lines_with_that_count}}
char_frequency = defaultdict(Counter)
modes = {} modes = {}
delims = {} delims = {}
start, end = 0, chunkLength start, end = 0, chunkLength
while start < len(data): while start < len(data):
iteration += 1 iteration += 1
for line in data[start:end]: for line in data[start:end]:
for char in ascii: num_lines += 1
metaFrequency = charFrequency.get(char, {}) for char, count in Counter(line).items():
# must count even if frequency is 0 if char.isascii():
freq = line.count(char) char_frequency[char][count] += 1
# value is the mode
metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
charFrequency[char] = metaFrequency
for char in charFrequency.keys(): for char, counts in char_frequency.items():
items = list(charFrequency[char].items()) items = list(counts.items())
missed_lines = num_lines - sum(counts.values())
if missed_lines:
# Store the number of lines 'char' was missing from.
items.append((0, missed_lines))
if len(items) == 1 and items[0][0] == 0: if len(items) == 1 and items[0][0] == 0:
continue continue
# get the mode of the frequencies # get the mode of the frequencies

View file

@ -1437,6 +1437,56 @@ def test_doublequote(self):
dialect = sniffer.sniff(self.sample9) dialect = sniffer.sniff(self.sample9)
self.assertTrue(dialect.doublequote) self.assertTrue(dialect.doublequote)
def test_guess_delimiter_crlf_not_chosen(self):
# Ensure that we pick the real delimiter ("|") over "\r" in a tie.
sniffer = csv.Sniffer()
sample = "a|b\r\nc|d\r\ne|f\r\n"
self.assertEqual(sniffer.sniff(sample).delimiter, "|")
self.assertNotEqual(sniffer.sniff(sample).delimiter, "\r")
def test_zero_mode_tie_order_independence(self):
sniffer = csv.Sniffer()
# ":" appears in half the rows (1, 0, 1, 0) - a tie between
# 0 and 1 per line.
# "," appears once every row (true delimiter).
#
# Even if the zero-frequency bucket is appended vs. inserted, the tie
# yields an adjusted score of 0, so ":" should not be promoted and
# "," must be selected.
sample = (
"a,b:c\n"
"d,e\n"
"f,g:c\n"
"h,i\n"
)
dialect = sniffer.sniff(sample)
self.assertEqual(dialect.delimiter, ",")
def test_zero_mode_tie_order_comma_first(self):
sniffer = csv.Sniffer()
pattern = (
"a,b\n"
"c:d\n"
"e,f\n"
"g:h\n"
)
sample = pattern * 10
with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
sniffer.sniff(sample)
def test_zero_mode_tie_order_colon_first(self):
sniffer = csv.Sniffer()
pattern = (
"a:b\n"
"c,d\n"
"e:f\n"
"g,h\n"
)
sample = pattern * 10
with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
sniffer.sniff(sample)
class NUL: class NUL:
def write(s, *args):
    """Discard everything written to this null sink."""
    pass

View file

@ -0,0 +1 @@
Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.6x.