clamav/unit_tests/clamscan/allmatch_test.py

# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.

"""
Run clamscan tests.
"""

import os
from zipfile import ZIP_DEFLATED, ZipFile
import sys
import hashlib

sys.path.append('../unit_tests')
import testcase


class TC(testcase.TestCase):
    @classmethod
    def setUpClass(cls):
        """Create the shared signature database directory for this test class.

        The directory is stored in ``TC.path_db`` and populated with one
        database file per extension (.ndb, .ldb, .hdb, .hsb, .imp, .mdb).
        """
        super().setUpClass()

        # Prepare a directory to store our test databases
        TC.path_db = TC.path_tmp / 'database'
        TC.path_db.mkdir(parents=True)

        # (database file name, full file content) pairs.
        databases = (
            (
                'clam.ndb',
                "Test.NDB:0:*:4b45524e454c33322e444c4c00004578\n",
            ),
            (
                'clam.ldb',
                "Test.LDB;Engine:52-255,Target:1;0;4B45524E454C33322E444C4C00004578697450726F63657373005553455233322E444C4C00434C414D657373616765426F7841\n",
            ),
            (
                'clam.hdb',
                "aa15bcf478d165efd2065190eb473bcb:544:Test.MD5.Hash:73\n"
                "aa15bcf478d165efd2065190eb473bcb:*:Test.MD5.Hash.NoSize:73\n",
            ),
            (
                'clam.hsb',
                "71e7b604d18aefd839e51a39c88df8383bb4c071dc31f87f00a2b5df580d4495:544:Test.Sha256.Hash:73\n"
                "71e7b604d18aefd839e51a39c88df8383bb4c071dc31f87f00a2b5df580d4495:*:Test.Sha256.Hash.NoSize:73\n"
                "62dd70f5e7530e0239901ac186f1f9ae39292561:544:Test.Sha1.Hash:73\n"
                "62dd70f5e7530e0239901ac186f1f9ae39292561:*:Test.Sha1.NoSize:73\n",
            ),
            (
                'clam.imp',
                "98c88d882f01a3f6ac1e5f7dfd761624:39:Test.Import.Hash\n"
                "98c88d882f01a3f6ac1e5f7dfd761624:*:Test.Import.Hash.NoSize\n",
            ),
            (
                'clam.mdb',
                "512:23db1dd3f77fae25610b6a32701313ae:Test.PESection.Hash:73\n"
                "*:23db1dd3f77fae25610b6a32701313ae:Test.PESection.Hash.NoSize:73\n",
            ),
        )
        for db_name, db_text in databases:
            (TC.path_db / db_name).write_text(db_text)

    @classmethod
    def tearDownClass(cls):
        """Delegate class-level cleanup to the base test case."""
        super().tearDownClass()

    def setUp(self):
        """Delegate per-test setup to the base test case."""
        super().setUp()

    def tearDown(self):
        """Run the base per-test teardown, then check the valgrind log."""
        super().tearDown()
        self.verify_valgrind_log()

    def test_many_sigs(self):
        """Scan clam.exe against the whole test database directory with --allmatch.

        Every signature name written by setUpClass is expected in the output.
        """
        self.step_name('Test that each type of sig alerts in all-match mode')

        sample = TC.path_build / 'unit_tests' / 'input' / 'clamav_hdb_scanfiles' / 'clam.exe'

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles} --allmatch'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            path_db=TC.path_db,
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        self.verify_output(result.out, expected=[
            'Test.LDB.UNOFFICIAL FOUND',
            'Test.NDB.UNOFFICIAL FOUND',
            'Test.MD5.Hash.UNOFFICIAL FOUND',
            'Test.MD5.Hash.NoSize.UNOFFICIAL FOUND',
            'Test.Sha1.Hash.UNOFFICIAL FOUND',
            'Test.Sha1.NoSize.UNOFFICIAL FOUND',
            'Test.Sha256.Hash.UNOFFICIAL FOUND',
            'Test.Sha256.Hash.NoSize.UNOFFICIAL FOUND',
            'Test.PESection.Hash.UNOFFICIAL FOUND',
            'Test.PESection.Hash.NoSize.UNOFFICIAL FOUND',
            'Test.Import.Hash.UNOFFICIAL FOUND',
            'Test.Import.Hash.NoSize.UNOFFICIAL FOUND',
        ])

    def test_many_sigs_no_allmatch(self):
        """Scan clam.exe against the test database directory without --allmatch.

        Exactly one 'FOUND' must appear in the output; which signature
        produced it is not checked.
        """
        self.step_name('Test that only one sig alerts when not using all-match mode')

        sample = TC.path_build / 'unit_tests' / 'input' / 'clamav_hdb_scanfiles' / 'clam.exe'

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            path_db=TC.path_db,
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        # Only one of the signatures is reported (order not guaranteed afaik,
        # so we don't care which one it is).
        found_count = result.out.count('FOUND')
        assert found_count == 1

    def test_regression_imphash_nosize(self):
        """Scan clam.exe with a single wildcard-size import hash sig, without --allmatch.

        The signature file is written into a fresh sub-directory of
        ``TC.path_db`` and passed to clamscan directly as the database.
        """
        self.step_name('Test an import hash with wildcard size when all-match mode is disabled.')

        regression_dir = TC.path_db / 'allmatch-regression-test-sigs'
        os.mkdir(str(regression_dir))

        imp_db = regression_dir / 'clam.imp'
        imp_db.write_text(
            "98c88d882f01a3f6ac1e5f7dfd761624:*:Test.Import.Hash.NoSize\n"
        )

        sample = TC.path_build / 'unit_tests' / 'input' / 'clamav_hdb_scanfiles' / 'clam.exe'

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            path_db=imp_db,
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        self.verify_output(result.out, expected=[
            'Test.Import.Hash.NoSize.UNOFFICIAL FOUND',
        ])

    def test_regression_cbc_and_ndb(self):
        """Scan a test-string file with both a bytecode sig and an NDB sig in all-match mode.

        Both the bytecode alert and the content-match alert are expected.
        """
        self.step_name('Test that bytecode rules will run after content match alerts in all-match mode.')

        # Source for ClamAV-Unit-Test_Signature.cbc
        # ```c
        # VIRUSNAME_PREFIX("BC.Clamav-Unit-Test-Signature")
        # VIRUSNAMES("")
        # TARGET(0)
        #
        # FUNCTIONALITY_LEVEL_MIN(FUNC_LEVEL_096_4)
        #
        # SIGNATURES_DECL_BEGIN
        # DECLARE_SIGNATURE(test_string)
        # SIGNATURES_DECL_END
        #
        # SIGNATURES_DEF_BEGIN
        # /* matches "CLAMAV-TEST-STRING-NOT-EICAR" */
        # DEFINE_SIGNATURE(test_string, "0:434c414d41562d544553542d535452494e472d4e4f542d4549434152")
        # SIGNATURES_DEF_END
        #
        # bool logical_trigger()
        # {
        #     return matches(Signatures.test_string);
        # }
        #
        # int entrypoint(void)
        # {
        #     foundVirus("");
        #     return 0;
        # }
        # ```

        # The scanned file contains nothing but the test string.
        sample = TC.path_tmp / 'CLAMAV-TEST-STRING-NOT-EICAR'
        sample.write_text("CLAMAV-TEST-STRING-NOT-EICAR")

        sig_input = TC.path_source / 'unit_tests' / 'input'

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {cbc_db} -d {ndb_db} --bytecode-unsigned --allmatch {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            cbc_db=sig_input / 'bytecode_sigs' / 'Clamav-Unit-Test-Signature.cbc',
            ndb_db=sig_input / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        self.verify_output(result.out, expected=[
            # ".UNOFFICIAL" is not added for bytecode signatures
            'BC.Clamav-Unit-Test-Signature FOUND',
            'NDB.Clamav-Unit-Test-Signature.UNOFFICIAL FOUND',
        ])

    def test_txt_plus_clam_zipsfx(self):
        """Scan a file made of the test string followed by the bytes of clam.zip.

        Both the NDB test-string signature and the clamav.hdb hash signature
        are expected to alert in all-match mode.
        """
        self.step_name('Test that clam will detect a string in text file, plus identify, extract, and alert on concatenated clam.zip containing clam.exe with a hash sig.')

        clam_zip = TC.path_build / 'unit_tests' / 'input' / 'clamav_hdb_scanfiles' / 'clam.zip'

        # Test string first, zip archive appended right after it.
        sample = TC.path_tmp / 'test-string-cat-clam.exe.txt'
        sample.write_bytes(b"CLAMAV-TEST-STRING-NOT-EICAR" + clam_zip.read_bytes())

        sig_input = TC.path_source / 'unit_tests' / 'input'

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {clam_exe_db} -d {not_eicar_db} --allmatch {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            clam_exe_db=sig_input / 'clamav.hdb',
            not_eicar_db=sig_input / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        self.verify_output(result.out, expected=[
            'ClamAV-Test-File.UNOFFICIAL FOUND',
            'NDB.Clamav-Unit-Test-Signature.UNOFFICIAL FOUND',
        ])

    def test_exe_imphash_plus_zipsfx(self):
        """Scan clam.exe with a zip appended, using an import-hash sig for the exe.

        The appended zip holds a file with the test string; both the
        import-hash signature and the NDB test-string signature must alert.
        """
        self.step_name('Test that clam will detect a string in text file, plus identify, extract, and alert on concatenated clam.zip containing clam.exe with an imp-hash sig.')

        # We can't use the hash sig for this clam.exe program because the hash
        # goes out the window when we concatenate on the zip.
        imp_db = TC.path_tmp / 'clam.imp'
        imp_db.write_text(
            "98c88d882f01a3f6ac1e5f7dfd761624:39:Test.Import.Hash\n"
        )

        # Build a file that is the clam.exe program with a zip concatenated on
        # that contains the not_eicar test string file.
        clam_exe = TC.path_build / 'unit_tests' / 'input' / 'clamav_hdb_scanfiles' / 'clam.exe'

        appended_zip = TC.path_tmp / 'not-eicar.zip'
        with ZipFile(str(appended_zip), 'w', ZIP_DEFLATED) as archive:
            archive.writestr('not-eicar.txt', b"CLAMAV-TEST-STRING-NOT-EICAR")

        sample = TC.path_tmp / 'clam.exe.not_eicar.zipsfx'
        sample.write_bytes(clam_exe.read_bytes() + appended_zip.read_bytes())

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {clam_exe_db} -d {not_eicar_db} --allmatch {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            clam_exe_db=imp_db,
            not_eicar_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        self.verify_output(result.out, expected=[
            'Test.Import.Hash.UNOFFICIAL FOUND',
            'NDB.Clamav-Unit-Test-Signature.UNOFFICIAL FOUND',
        ])

    def test_exe_pattern_plus_zipsfx(self):
        """Scan clam.exe with a zip appended, using a pattern-match sig for the exe.

        This tests a regression where clam will fail to extract the embedded
        zip file if the pattern-match sig matches before the embedded file
        type sig.
        """
        self.step_name('Test that clam will detect a string in text file, plus identify, extract, and alert on concatenated clam.zip containing clam.exe with a pattern-match sig.')

        # Build a file that is the clam.exe program with a zip concatenated on
        # that contains the not_eicar test string file.
        clam_exe = TC.path_build / 'unit_tests' / 'input' / 'clamav_hdb_scanfiles' / 'clam.exe'

        appended_zip = TC.path_tmp / 'not-eicar.zip'
        with ZipFile(str(appended_zip), 'w', ZIP_DEFLATED) as archive:
            archive.writestr('not-eicar.txt', b"CLAMAV-TEST-STRING-NOT-EICAR")

        sample = TC.path_tmp / 'clam.exe.not_eicar.zipsfx'
        sample.write_bytes(clam_exe.read_bytes() + appended_zip.read_bytes())

        # We can't use the hash sig for this clam.exe program because the hash
        # goes out the window when we concatenate on the zip.
        pattern_db = TC.path_db / 'clam.ndb'

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {clam_exe_db} -d {not_eicar_db} --allmatch {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            clam_exe_db=pattern_db,
            not_eicar_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        self.verify_output(result.out, expected=[
            'Test.NDB.UNOFFICIAL FOUND',
            'NDB.Clamav-Unit-Test-Signature.UNOFFICIAL FOUND',
        ])

    def test_zip_plus_zip(self):
        """Scan clam.zip with a second zip appended to its end.

        Alerts are expected from both the pattern-match signature in
        clam.ndb and the NDB test-string signature.
        """
        self.step_name('Test that clam will the clam.zip and also another zip concatenated to the end.')

        # Build a file that is the clam.zip archive with a zip concatenated on
        # that contains the not_eicar test string file.
        clam_zip = TC.path_build / 'unit_tests' / 'input' / 'clamav_hdb_scanfiles' / 'clam.zip'

        appended_zip = TC.path_tmp / 'not-eicar.zip'
        with ZipFile(str(appended_zip), 'w', ZIP_DEFLATED) as archive:
            archive.writestr('not-eicar.txt', b"CLAMAV-TEST-STRING-NOT-EICAR")

        sample = TC.path_tmp / 'clam.zip.not_eicar.zipsfx'
        sample.write_bytes(clam_zip.read_bytes() + appended_zip.read_bytes())

        # Pattern-match signature database created in setUpClass.
        pattern_db = TC.path_db / 'clam.ndb'

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {clam_exe_db} -d {not_eicar_db} --allmatch {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            clam_exe_db=pattern_db,
            not_eicar_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        self.verify_output(result.out, expected=[
            'Test.NDB.UNOFFICIAL FOUND',
            'NDB.Clamav-Unit-Test-Signature.UNOFFICIAL FOUND',
        ])

    def test_zip_all_files(self):
        """Scan a four-member zip with one sha2-256 hash signature per member.

        All four member signatures are expected to alert in all-match mode.
        """
        self.step_name('Test that clam will extract all files from a zip.')

        members = (
            ('file-0.txt', b"Test file 0"),
            ('file-1.txt', b"Test file 1"),
            ('file-2.txt', b"Test file 2"),
            ('file-3.txt', b"Test file 3"),
        )

        sample = TC.path_tmp / 'multi-file.zip'
        with ZipFile(str(sample), 'w', ZIP_DEFLATED) as archive:
            for member_name, payload in members:
                archive.writestr(member_name, payload)

        # Calculate sha2-256 and len for all files, read back from the archive.
        digests = {}
        with ZipFile(str(sample), 'r') as archive:
            for member_name in archive.namelist():
                content = archive.read(member_name)
                digests[member_name] = (hashlib.sha256(content).hexdigest(), len(content))

        # Make sha2-256 signatures for all files
        hash_db = TC.path_db / 'missing_entries.hsb'
        with open(hash_db, 'w') as sig_file:
            sig_file.writelines(
                f"{digest}:{size}:{member_name}.NDB:73\n"
                for member_name, (digest, size) in digests.items()
            )

        scan_template = '{valgrind} {valgrind_args} {clamscan} -d {missing_entries_db} --allmatch {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            # Load only the hash signatures generated above.
            missing_entries_db=hash_db,
            testfiles=sample,
        ))

        assert result.ec == 1  # virus

        self.verify_output(result.out, expected=[
            'file-0.txt.NDB.UNOFFICIAL FOUND',
            'file-1.txt.NDB.UNOFFICIAL FOUND',
            'file-2.txt.NDB.UNOFFICIAL FOUND',
            'file-3.txt.NDB.UNOFFICIAL FOUND',
        ])

    def test_zip_no_central_directory(self):
        """Scan a four-member zip whose central directory has been cut off.

        The archive is written normally, one sha2-256 hash signature is
        generated per member, and then everything from the first central
        directory record onward is removed. All four member signatures are
        still expected to alert in all-match mode.
        """
        self.step_name('Test that clam will extract files from a zip with no central directory.')

        testfile = TC.path_tmp / 'multi-file-no-central.zip'
        with ZipFile(str(testfile), 'w', ZIP_DEFLATED) as zf:
            zf.writestr('file-0.txt', b"Test file 0")
            zf.writestr('file-1.txt', b"Test file 1")
            zf.writestr('file-2.txt', b"Test file 2")
            zf.writestr('file-3.txt', b"Test file 3")

        # Calculate sha2-256 and len for all files
        sha2_256s = {}
        with ZipFile(str(testfile), 'r') as zf:
            for name in zf.namelist():
                data = zf.read(name)
                sha2_256s[name] = (hashlib.sha256(data).hexdigest(), len(data))

        # Make sha2-256 signatures for all files
        with open(TC.path_db / 'missing_entries.hsb', 'w') as f:
            for name, (digest, size) in sha2_256s.items():
                f.write(f"{digest}:{size}:{name}.NDB:73\n")

        # Remove the central directory.
        # Each central directory record starts with the 4-byte signature
        # 'PK\x01\x02'. Search at every byte offset: reading the file in 4-byte
        # steps would only find the signature if it happened to be 4-byte
        # aligned, and would spin forever at end-of-file otherwise.
        archive_bytes = testfile.read_bytes()
        central_dir_offset = archive_bytes.find(b'PK\x01\x02')
        assert central_dir_offset != -1, 'central directory signature not found in test zip'

        # Keep only the bytes that precede the first central directory record.
        testfile.write_bytes(archive_bytes[:central_dir_offset])

        command = '{valgrind} {valgrind_args} {clamscan} -d {missing_entries_db} --allmatch {testfiles}'.format(
            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
            # Load only the hash signatures generated above.
            missing_entries_db=TC.path_db / 'missing_entries.hsb',
            testfiles=testfile,
        )
        output = self.execute_command(command)

        assert output.ec == 1  # virus

        expected_results = [
            'file-0.txt.NDB.UNOFFICIAL FOUND',
            'file-1.txt.NDB.UNOFFICIAL FOUND',
            'file-2.txt.NDB.UNOFFICIAL FOUND',
            'file-3.txt.NDB.UNOFFICIAL FOUND',
        ]
        self.verify_output(output.out, expected=expected_results)

    def test_zip_missing_centrals(self):
        """Scan a four-member zip whose central directory has been partly removed.

        The archive is written normally, one sha2-256 hash signature is
        generated per member, and then the central directory is rewritten with
        some of its records left out. All four member signatures are still
        expected to alert in all-match mode.
        """
        self.step_name('Test that clam will detect files omitted from zip central directory.')

        testfile = TC.path_tmp / 'multi-file-missing-centrals.zip'
        with ZipFile(str(testfile), 'w', ZIP_DEFLATED) as zf:
            zf.writestr('file-0.txt', b"Test file 0")
            zf.writestr('file-1.txt', b"Test file 1")
            zf.writestr('file-2.txt', b"Test file 2")
            zf.writestr('file-3.txt', b"Test file 3")

        # Calculate sha2-256 and len for all files
        sha2_256s = {}
        with ZipFile(str(testfile), 'r') as zf:
            for name in zf.namelist():
                data = zf.read(name)
                sha2_256s[name] = (hashlib.sha256(data).hexdigest(), len(data))

        # Make sha2-256 signatures for all files
        with open(TC.path_db / 'missing_entries.hsb', 'w') as f:
            for name, (digest, size) in sha2_256s.items():
                f.write(f"{digest}:{size}:{name}.NDB:73\n")

        # Rewrite the central directory with some records left out.
        # Each central directory record starts with the 4-byte signature 'PK\x01\x02'.
        cd_signature = b'PK\x01\x02'
        archive_bytes = testfile.read_bytes()

        # Locate the first record at any byte offset (it might not be aligned).
        # A bounded search fails the test instead of looping forever if the
        # signature is missing.
        first_record_offset = archive_bytes.find(cd_signature)
        assert first_record_offset != -1, 'central directory signature not found in test zip'

        # Everything up to and including the first record's signature is kept.
        kept_prefix_len = first_record_offset + len(cd_signature)

        # Splitting what follows on the signature yields the record bodies
        # (signature stripped); the last body also carries the end of central
        # directory record. Presumably bodies 0..3 correspond to file-0.txt ..
        # file-3.txt, in the order the members were added.
        record_bodies = archive_bytes[kept_prefix_len:].split(cd_signature)

        # Keep bodies 1 and 3; bodies 0 and 2 are dropped.
        # NOTE(review): the retained signature ends up in front of body 1, but
        # body 3 is written without a signature in front of it, so it is not a
        # well-formed record on its own. Confirm whether that is intended before
        # changing it; the byte layout here is what the test has always produced.
        testfile.write_bytes(
            archive_bytes[:kept_prefix_len] + record_bodies[1] + record_bodies[3]
        )

        command = '{valgrind} {valgrind_args} {clamscan} -d {missing_entries_db} --allmatch {testfiles}'.format(
            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
            # Load only the hash signatures generated above.
            missing_entries_db=TC.path_db / 'missing_entries.hsb',
            testfiles=testfile,
        )
        output = self.execute_command(command)

        assert output.ec == 1  # virus

        expected_results = [
            'file-0.txt.NDB.UNOFFICIAL FOUND',
            'file-1.txt.NDB.UNOFFICIAL FOUND',
            'file-2.txt.NDB.UNOFFICIAL FOUND',
            'file-3.txt.NDB.UNOFFICIAL FOUND',
        ]
        self.verify_output(output.out, expected=expected_results)

    def test_pe_allmatch(self):
        """Scan test.exe against the pe_allmatch signature sets in all-match mode.

        Every file in 'alert-sigs' and 'block-cert-sigs' names a signature
        that must be reported; every file in 'broken-sigs' names one that
        must not be.
        """
        self.step_name('Test that clam will detect a string in test.exe with a wide variety of signatures written or generated for the file.')

        # The sig set and test.exe for this test set were written by one of our
        # threat researchers to test the allmatch option. Overall, it's much more
        # thorough than previous tests, but some of the tests duplicate them.

        # TODO: The section signatures are not working as written, hence the "broken_dbs" directory.
        #       There is a known issue with relative offset signatures when using the Boyer-Moore matcher.
        #       The sigs work if using the Aho-Corasick matcher.
        #       When we fix section signatures, we can move them to the alerting sigs directory and update this test.

        sig_root = TC.path_source / 'unit_tests' / 'input' / 'pe_allmatch'
        alert_dir = sig_root / 'alert-sigs'
        block_cert_dir = sig_root / 'block-cert-sigs'
        broken_dir = sig_root / 'broken-sigs'

        scan_template = '{valgrind} {valgrind_args} {clamscan} \
             -d {alerting_dbs} \
             -d {weak_dbs} \
             -d {broken_dbs} \
             -d {block_cert_dbs} \
             --allmatch --bytecode-unsigned {testfiles}'
        result = self.execute_command(scan_template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            alerting_dbs=alert_dir,
            block_cert_dbs=block_cert_dir,
            weak_dbs=sig_root / 'weak-sigs',
            broken_dbs=broken_dir,
            testfiles=sig_root / 'test.exe',
        ))

        assert result.ec == 1

        # The alert sig files are all given the signature name, so we can verify that the correct sigs were found.
        # We need only to trim off the extension and say "FOUND" for the alerting sigs.
        # Note: Some of the file names include ".UNOFFICIAL" because not every sig is reported with that suffix.
        #       I think this is a minor bug. So if we change that, we'll need to update this test.
        must_alert = []
        for sig_dir in (alert_dir, block_cert_dir):
            must_alert.extend('{sig} FOUND'.format(sig=sig_file.stem) for sig_file in sig_dir.iterdir())

        # The broken sig files are all given the signature name, so we can verify that they did not alert.
        # TODO: When we fix section signatures, we can move them to the alerting sigs directory and get rid of this.
        must_not_alert = ['{sig} FOUND'.format(sig=sig_file.stem) for sig_file in broken_dir.iterdir()]

        self.verify_output(result.out, expected=must_alert, unexpected=must_not_alert)