From a1cd8215bea6440365231cf9a8a1b5ea7e38a7f4 Mon Sep 17 00:00:00 2001 From: "Val S." Date: Fri, 10 Oct 2025 20:32:23 -0400 Subject: [PATCH 1/3] Fix issue detecting VBA projects Previously for documents containing VBA projects, the VBA was treated as an object within the document and not as a normalized version of the document. I apparently switched it to say that the VBA is a normalized version of the document. This kind of makes sense in that presently Javascript extracted from HTML is treated as a normalized version of the HTML. But it probably shouldn't. Normalized layers are treated as the same file as the parent. So now those older signatures that match on VBA projects using "Container:CL_TYPE_MSOLE2" are failing to match. So this commit switches it back. VBA project bits written out to a temp file for scanning will be treated as being contained within the document. CLAM-2896 --- libclamav/scanners.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclamav/scanners.c b/libclamav/scanners.c index f21494545..d432a6117 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -1665,7 +1665,7 @@ static cl_error_t cli_ole2_tempdir_scan_vba_new(const char *dir, cli_ctx *ctx, s goto done; } - ret = cli_scan_desc(tempfd, ctx, CL_TYPE_SCRIPT, false, NULL, AC_SCAN_VIR, NULL, "extracted-vba-project", tempfile, LAYER_ATTRIBUTES_NORMALIZED); + ret = cli_scan_desc(tempfd, ctx, CL_TYPE_SCRIPT, false, NULL, AC_SCAN_VIR, NULL, "extracted-vba-project", tempfile, LAYER_ATTRIBUTES_NONE); if (CL_SUCCESS != ret) { goto done; } From 0cf59435dcd2d35bb27ac7587b3938335bf3dd81 Mon Sep 17 00:00:00 2001 From: "Val S." Date: Fri, 10 Oct 2025 21:59:43 -0400 Subject: [PATCH 2/3] Increase max embedded objects limit from 10 -> 16 By limiting the embedded file recognition in embedded files, we detect fewer embedded files overall. For example, imagine a PE with a structure of embedded files like so: outer pe: emb. file #1: valid pe #1 emb. file #2: valid pe #2 emb. 
file #3: valid pe #3 emb. file #4: false positive for pe emb. file #5: false positive for pe emb. file #6: false positive for pe emb. file #7: false positive for pe emb. file #8: false positive for pe emb. file #9: false positive for pe emb. file #10: false positive for pe emb. file #11: valid pe #4 With an embedded objects limit of 10, we won't extract that 4th valid PE file. However, previously we allowed detection of embedded files within embedded files, so ClamAV mistook the above structure for something like this: outer pe: emb. file #1: valid pe #1 emb. file #1: valid pe #2 emb. file #1: valid pe #3 emb. file #1: false positive for pe emb. file #2: false positive for pe emb. file #3: false positive for pe emb. file #4: false positive for pe emb. file #5: false positive for pe emb. file #6: false positive for pe emb. file #7: false positive for pe emb. file #8: valid pe #4 As you can see, this is able to find and scan that 4th PE file without exceeding an embedded object limit of 10. The old way of detecting embedded files within embedded files has other drawbacks and is obviously inaccurate in terms of the actual file structure. But it did have that going for it. Anyways, to improve detection, this PR bumps the embedded objects limit to 16. I think that's okay since we've added header checks for several types like PE's, and have also removed the need to drop embedded PE files to a temp file for each scan. CLAM-2897 --- libclamav/filetypes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclamav/filetypes.h b/libclamav/filetypes.h index 077f68296..4b3e41b69 100644 --- a/libclamav/filetypes.h +++ b/libclamav/filetypes.h @@ -33,7 +33,7 @@ #define MAGIC_BUFFER_SIZE 1028 #define CL_TYPENO 500 -#define MAX_EMBEDDED_OBJ 10 +#define MAX_EMBEDDED_OBJ 16 typedef enum cli_file { CL_TYPE_ANY = 0, From 3c3b5e3dfd17f6adcb57650a292e5816807e93c2 Mon Sep 17 00:00:00 2001 From: "Val S." 
Date: Fri, 10 Oct 2025 22:45:52 -0400 Subject: [PATCH 3/3] Loosen restrictions on embedded file identification In regression testing against a large sample set, I found that strictly disallowing any embedded file identification if any previous layer was an embedded file resulted in missed detections. Specifically, I found an MSEXE file which has an embedded RAR, which in turn had another MSEXE that itself had an embedded 7ZIP containing... malware. sha256: c3cf573fd3d1568348506bf6dd6152d99368a7dc19037d135d5903bc1958ea85 To make it so ClamAV can extract all that, we must loosen the restriction and allow prior layers to be embedded, just not the current layer. I've also added some logic to prevent attempting to extract an object at the same offset twice. The `fpt->offset`s appear in order, but if you have multiple file type magic signatures match on the same address, like maybe you accidentally load two different .ftm files, then you'd get duplicates and double-extraction. As a bonus, I found I could also skip over offsets within a valid ZIP, ARJ, and CAB since we now have the length of those from the header check and as I know we don't want to extract embedded contents in that way. --- libclamav/scanners.c | 94 +++++++++----------------------------------- libclamav/unzip.c | 3 +- 2 files changed, 20 insertions(+), 77 deletions(-) diff --git a/libclamav/scanners.c b/libclamav/scanners.c index d432a6117..980182542 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -3670,6 +3670,8 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi // In allmatch-mode, ret will never be CL_VIRUS, so ret may be used exclusively for file type detection and for terminal errors. // When not in allmatch-mode, it's more important to return right away if ret is CL_VIRUS, so we don't care if file type matches were found. if (ret >= CL_TYPENO) { + size_t last_offset = 0; + // Matched 1+ file type signatures. Handle them. 
found_type = (cli_file_t)ret; @@ -3678,11 +3680,16 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi fpt = ftoffset; while (fpt) { - if (fpt->offset > 0) { + if ((fpt->offset > 0) && + // Only handle each offset once to prevent duplicate processing like if two signatures are found at the same offset. + ((size_t)fpt->offset > last_offset)) { + bool type_has_been_handled = true; bool ancestor_was_embedded = false; size_t i; + last_offset = (size_t)fpt->offset; + /* * First, use "embedded type recognition" to identify a file's actual type. * (a.k.a. not embedded files, but file type detection corrections) @@ -3873,84 +3880,10 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } - /* - * Only scan embedded files if we are not already in an embedded context. - * That is, if this or a previous layer was identified with embedded file type recognition, then we do - * not scan for embedded files again. - * - * This restriction will prevent detecting the same embedded content more than once when recursing with - * embedded file type recognition deeper within the same buffer. - * - * This is necessary because we have no way of knowing the length of a file for many formats and cannot - * prevent a search for embedded files from finding the same embedded content multiple times (like a LOT - * of times). - * - * E.g. if the file is like this: - * - * [ data ] [ embedded file ] [ data ] [ embedded file ] - * - * The first time we do it we'll find "two" embedded files, like this: - * - * Emb. File #1: [ embedded file ] [ data ] [ embedded file ] - * Emb. File #2: [ embedded file ] - * - * We must not scan Emb. File #1 again for embedded files, because it would double-extract Emb. File #2. - * - * There is a flaw in this logic, though. Suppose that we actually have: - * - * [ data ] [ compressed file w. 
recognizable magic bytes ] - * - * A first pass of the above will again identify "two" embedded files: - * - * Emb. File #1: [ compressed archive w. recognizable magic bytes ] - * Emb. File #2: [ magic bytes ] <- Compressed data/Not real file - * - * In this case, the magic bytes of a contained, compressed file is somehow still identifiable despite - * compression. The result is the Emb. File #2 will fail to be parsed and when we decompress Emb. File - * #1, then we maybe get something like this: - * - * Decompressed: [ data ] [ embedded file ] - * - * So if this happened... then we WOULD want to scan the decompressed file for embedded files. - * The problem is, we have no way of knowing how long embedded files are. - * We don't know if we have: - * - * A. [ data ] [ embedded file ] [ data ] [ embedded file ] - * or - * B. [ data ] [ embedded compressed archive w. recognizable magic bytes ] - * or - * C. [ data ] [ embedded uncompressed archive w. multiple file entries [ file 1 ] [ file 2 ] [ file 2 ] ] - * - * Some ideas for a more accurate solution: - * - * 1. Record the offset and size of each file extracted by the parsers. - * Then, when we do embedded file type recognition, we can check if the offset and size of the - * embedded file matches the offset and size of a file that was extracted by a parser. - * This falls apart a little bit for multiple layers of archives unless we also compare offsets within - * each layer. We could do that, but it would be a lot of work. And we'd probably want to take into - * consideration if files were decompressed or decrypted. ... I don't know a clean solution. - * - * 2. Have all parsers to run before embedded file type recognition and they each determine the length - * of the file they parsed, so we can differentiate between embedded files and appended files. 
- * For appended files, we would know they weren't extracted by a parser module and the parser for - * each of those would report the length of the file it parsed so we can use that to mitigate - * overlapping embedded file type recognition. - * But I highly doubt all file types can be parsed to determine the correct length of the file. - */ - for (i = ctx->recursion_level; i > 0; i--) { - if (ctx->recursion_stack[i].attributes & LAYER_ATTRIBUTES_EMBEDDED) { - // Found an ancestor that was embedded. - // Do not scan embedded files again. - ancestor_was_embedded = true; - break; - } - } - /* * Next, check for actual embedded files. */ - if ((false == ancestor_was_embedded) && - (false == type_has_been_handled)) { + if (false == type_has_been_handled) { cli_dbgmsg("%s signature found at %u\n", cli_ftname(fpt->type), (unsigned int)fpt->offset); type_has_been_handled = true; @@ -4003,6 +3936,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += zip_size - 1; // Note: size is definitely > 0 because header_check succeeded. + nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, @@ -4025,6 +3961,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += cab_size - 1; // Note: size is definitely > 0 because header_check succeeded. + nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, @@ -4048,6 +3987,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += arj_size - 1; // Note: size is definitely > 0 because header_check succeeded. 
+ nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, diff --git a/libclamav/unzip.c b/libclamav/unzip.c index a68e305ec..069ca3796 100644 --- a/libclamav/unzip.c +++ b/libclamav/unzip.c @@ -1310,7 +1310,8 @@ cl_error_t index_local_file_headers_within_bounds( index = *num_records; if (start_offset > fsize || end_offset > fsize || start_offset > end_offset) { - cli_errmsg("index_local_file_headers_within_bounds: Invalid offset arguments\n"); + cli_errmsg("index_local_file_headers_within_bounds: Invalid offset arguments: start_offset=%u, end_offset=%u, fsize=%u\n", + start_offset, end_offset, fsize); status = CL_EPARSE; goto done; }