# clamav/unit_tests/clamscan/fp_check_test.py
# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
"""
Run {valgrind} {valgrind_args} {clamscan} tests.
"""
import unittest
import hashlib
from zipfile import ZIP_DEFLATED, ZipFile
import sys
sys.path.append('../unit_tests')
import testcase
class TC(testcase.TestCase):
    @classmethod
    def setUpClass(cls):
        """Create the test file, a zipped copy of it, and the NDB/FP signature
        databases shared by every test in this class.
        """
        super(TC, cls).setUpClass()

        # PHP-ish content whose clamav-normalized form contains the pattern
        # matched by the Malicious.PHP.normalized signature below.
        # Defined once so the plain file and the zipped copy are guaranteed
        # to hold identical bytes.
        test_file_contents = b"""<?php
IGNORE_user_abort(asdf) scandir(asdfasdfasf]);
foreach(asdfasfs) strpos(asdfasfsfasf) sdfasdfasdf .php.suspected
aasdfasdfsf explode asdasdfasfsf
rename()
<script>sfasfasf</script>
?>
"""
        TC.test_file = TC.path_tmp / "test_file"
        TC.test_file.write_bytes(test_file_contents)

        # Pattern signature: hex of "ignore_user_abort(asdf)" as it appears
        # in the normalized form of the test file.
        TC.normalized_match_sig = TC.path_tmp / "normalized.ndb"
        TC.normalized_match_sig.write_text(r"Malicious.PHP.normalized:0:*:69676e6f72655f757365725f61626f7274286173646629")

        # FP (trust) signatures for the original, non-normalized file:
        # one fixed-size, one wildcard-size with a minimum functionality level.
        TC.original_hash_fp = TC.path_tmp / "original_hash.fp"
        TC.original_hash_fp.write_text(r"a4b3c39134fa424beb9f84ffe5f175a3:190:original_hash")
        TC.original_hash_wild_fp = TC.path_tmp / "original_hash.wild.fp"
        TC.original_hash_wild_fp.write_text(r"a4b3c39134fa424beb9f84ffe5f175a3:*:original_hash.wild:73")

        # The normalized hash is this for now. Changes to clamav normalization logic
        # may require changes to this hash.
        TC.normalized_hash_fp = TC.path_tmp / "normalized_hash.fp"
        TC.normalized_hash_fp.write_text(r"0e32a3ab501afb50daedc04764f8dc16:188:normalized_hash")
        TC.normalized_hash_wild_fp = TC.path_tmp / "normalized_hash.wild.fp"
        TC.normalized_hash_wild_fp.write_text(r"0e32a3ab501afb50daedc04764f8dc16:*:normalized_hash.wild:73")

        # Zip up the same content so FP checks can be exercised against an
        # archive that contains the test file.
        TC.test_file_zipped = TC.path_tmp / 'test_file.zip'
        with ZipFile(str(TC.test_file_zipped), 'w', ZIP_DEFLATED) as zf:
            # NOTE(review): logo.png is opened here but its contents are never
            # written into the archive. Presumably a truncated PNG was meant to
            # be added for --alert-broken-media coverage — confirm against the
            # upstream history. The open is kept so behavior (including the
            # requirement that logo.png exist in path_source) is unchanged.
            with (TC.path_source / 'logo.png').open('br') as logo_png:
                zf.writestr('test_file', test_file_contents)

        # Generate hash of the zipped file.
        # Since we generated the zip in python, we don't know the hash in advance.
        hash_sha2_256 = hashlib.sha256()
        with TC.test_file_zipped.open("rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_sha2_256.update(chunk)
        hash_sha2_256 = hash_sha2_256.hexdigest()

        TC.test_file_zipped_hash_fp = TC.path_tmp / 'test_file.zip.hash.fp'
        TC.test_file_zipped_hash_fp.write_text('{hash}:{size}:test_file.zip'.format(
            hash=hash_sha2_256,
            size=TC.test_file_zipped.stat().st_size))
        TC.test_file_zipped_hash_wild_fp = TC.path_tmp / 'test_file.zip.hash.wild.fp'
        TC.test_file_zipped_hash_wild_fp.write_text('{hash}:*:test_file.zip.wild:73'.format(
            hash=hash_sha2_256))
    @classmethod
    def tearDownClass(cls):
        # No class-level cleanup needed beyond the base class teardown.
        super(TC, cls).tearDownClass()

    def setUp(self):
        super(TC, self).setUp()

    def tearDown(self):
        super(TC, self).tearDown()
        # Check the valgrind log after every test so memory errors reported by
        # valgrind fail the specific test that triggered them.
        self.verify_valgrind_log()
def test_alerts_on_normalized(self):
"""
This test expects that the normalized pattern match sig without the .fp sig will in fact alert.
"""
self.step_name("Test file detection with pattern from normalized HTML")
output = self.execute_command(
"{valgrind} {valgrind_args} {clamscan} {testfiles} -d {db1}".format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
testfiles=TC.test_file,
db1=TC.normalized_match_sig,
)
)
self.verify_output(output.out, expected=["Malicious.PHP.normalized.UNOFFICIAL FOUND"], unexpected=[])
def test_alerts_on_zip(self):
"""
This test expects that the OG sig without the .fp sig will in fact alert.
"""
self.step_name("Test file detection with pattern from normalized HTML inside a ZIP file")
output = self.execute_command(
"{valgrind} {valgrind_args} {clamscan} {testfiles} -d {db1}".format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
testfiles=TC.test_file_zipped,
db1=TC.normalized_match_sig,
)
)
self.verify_output(output.out, expected=["Malicious.PHP.normalized.UNOFFICIAL FOUND"], unexpected=[])
def test_fp_for_normalized(self):
"""
This test expects that FP sigs for normalized HTML hashes will work,
because hashes are now created when an fmap is created and all embedded
file content to be scanned now gets its own fmap.
"""
self.step_name("Test file trusted with fixed-size hash of the normalized HTML")
output = self.execute_command(
"{valgrind} {valgrind_args} {clamscan} {testfiles} -d {db1} -d {db2} ".format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
testfiles=TC.test_file,
db1=TC.normalized_match_sig,
db2=TC.normalized_hash_fp,
)
)
self.verify_output(output.out, expected=["OK"], unexpected=[])
def test_fp_for_normalized_wild(self):
"""
This test expects that wildcard FP sigs for normalized HTML hashes will work,
because hashes are now created when an fmap is created and all embedded
file content to be scanned now gets its own fmap.
"""
self.step_name("Test file trusted with wild-card hash of the normalized HTML")
output = self.execute_command(
"{valgrind} {valgrind_args} {clamscan} {testfiles} -d {db1} -d {db2} ".format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
testfiles=TC.test_file,
db1=TC.normalized_match_sig,
db2=TC.normalized_hash_wild_fp,
)
)
self.verify_output(output.out, expected=["OK"], unexpected=[])
def test_fp_for_nonnormalized(self):
"""
This test expects that FP sigs for non-normalized HTML hashes will work,
because we now check each hash in the fmap recursion list.
"""
self.step_name("Test file trusted with the original non-normalized fixed-size hash")
output = self.execute_command(
"{valgrind} {valgrind_args} {clamscan} {testfiles} -d {db1} -d {db2}".format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
testfiles=TC.test_file,
db1=TC.normalized_match_sig,
db2=TC.original_hash_fp,
)
)
self.verify_output(output.out, expected=["OK"], unexpected=[])
def test_fp_for_nonnormalized_wild(self):
"""
This test expects that FP sigs for non-normalized HTML hashes will work,
because we now check each hash in the fmap recursion list.
"""
self.step_name("Test file trusted with the original non-normalized wild-card hash")
output = self.execute_command(
"{valgrind} {valgrind_args} {clamscan} {testfiles} -d {db1} -d {db2}".format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
testfiles=TC.test_file,
db1=TC.normalized_match_sig,
db2=TC.original_hash_wild_fp,
)
)
self.verify_output(output.out, expected=["OK"], unexpected=[])
def test_fp_for_zipped_file(self):
"""
This test expects that FP sigs for a zip containing the test file will work.
"""
self.step_name("Test file trusted with fixed-size hash of zip containing test file")
output = self.execute_command(
"{valgrind} {valgrind_args} {clamscan} {testfiles} -d {db1} -d {db2}".format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
testfiles=TC.test_file_zipped,
db1=TC.normalized_match_sig,
db2=TC.test_file_zipped_hash_fp,
)
)
self.verify_output(output.out, expected=["OK"], unexpected=[])
def test_fp_for_zipped_file_wild(self):
"""
This test expects that FP sigs for a zip containing the test file will work.
"""
self.step_name("Test file trusted with wildcard hash of zip containing test file")
output = self.execute_command(
"{valgrind} {valgrind_args} {clamscan} {testfiles} -d {db1} -d {db2}".format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
testfiles=TC.test_file_zipped,
db1=TC.normalized_match_sig,
db2=TC.test_file_zipped_hash_wild_fp,
)
)
self.verify_output(output.out, expected=["OK"], unexpected=[])