From 5fe5f872527b0ae747e1b816ad883f1540c82d8c Mon Sep 17 00:00:00 2001 From: Micah Snyder Date: Wed, 8 Oct 2025 17:02:22 -0400 Subject: [PATCH 1/9] Fix performance issue scanning some Windows executables Scanning CL_TYPE_MSEXE that have embedded file type signature matches for CL_TYPE_MSEXE are incorrectly passing the PE header check for the contained file, resulting in excessive scan times. The problem is that the `peinfo` struct needs to have the `offset` set for the contained `CL_TYPE_MSEXE` match prior to the header check. Without that, the header check was actually validating the PE header of the original file, which would always pass when that's a PE, and would always fail if it's an OLE2 file (the other type which we check for contained PEs). The additional code change in this commit is to make it so the `ctx` parameter must never be NULL, and removing the `map` parameter because, in practice, that is always from `ctx->fmap`. This is to safeguard against future changes to the function that may accidentally use `ctx` without a proper NULL check. CLAM-2882 --- libclamav/pe.c | 21 ++++++++++++--------- libclamav/pe.h | 2 +- libclamav/scanners.c | 4 ++-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/libclamav/pe.c b/libclamav/pe.c index b899c1b9b..e08e2d9cd 100644 --- a/libclamav/pe.c +++ b/libclamav/pe.c @@ -2781,7 +2781,7 @@ int cli_scanpe(cli_ctx *ctx) cli_exe_info_init(peinfo, 0); - peheader_ret = cli_peheader(map, peinfo, opts, ctx); + peheader_ret = cli_peheader(ctx, peinfo, opts); // Warn the user if PE header parsing failed - if it's a binary that runs // successfully on Windows, we need to relax our PE parsing standards so @@ -4360,7 +4360,7 @@ int cli_scanpe(cli_ctx *ctx) cl_error_t cli_pe_targetinfo(cli_ctx *ctx, struct cli_exe_info *peinfo) { - return cli_peheader(ctx->fmap, peinfo, CLI_PEHEADER_OPT_EXTRACT_VINFO, NULL); + return cli_peheader(ctx, peinfo, CLI_PEHEADER_OPT_EXTRACT_VINFO); } /** Parse the PE header and, if successful, populate peinfo @@ -4420,7 +4420,7 @@ cl_error_t cli_pe_targetinfo(cli_ctx *ctx, struct cli_exe_info *peinfo) * * TODO Same as above but with JSON creation */ -cl_error_t cli_peheader(fmap_t *map, struct cli_exe_info *peinfo, uint32_t opts, cli_ctx *ctx) +cl_error_t cli_peheader(cli_ctx *ctx, struct cli_exe_info *peinfo, uint32_t opts) { cl_error_t ret = CL_ERROR; @@ -4446,17 +4446,20 @@ cl_error_t cli_peheader(fmap_t *map, struct cli_exe_info *peinfo, uint32_t opts, size_t read; uint32_t temp; + fmap_t *map = NULL; + int toval = 0; struct json_object *pe_json = NULL; char jsonbuf[128]; - if (ctx == NULL && - (opts & CLI_PEHEADER_OPT_COLLECT_JSON || - opts & CLI_PEHEADER_OPT_DBG_PRINT_INFO)) { - cli_errmsg("cli_peheader: ctx can't be NULL for options specified\n"); + if (ctx == NULL) { + cli_errmsg("cli_peheader: ctx can't be NULL\n"); + ret = CL_EARG; goto done; } + map = ctx->fmap; + if (opts & CLI_PEHEADER_OPT_COLLECT_JSON) { pe_json = get_pe_property(ctx); } @@ -5469,7 +5472,7 @@ cl_error_t cli_check_auth_header(cli_ctx *ctx, struct cli_exe_info *peinfo) peinfo = &_peinfo; cli_exe_info_init(peinfo, 0); - if (CL_SUCCESS != cli_peheader(ctx->fmap, peinfo, CLI_PEHEADER_OPT_NONE, NULL)) { + if (CL_SUCCESS != cli_peheader(ctx, peinfo, CLI_PEHEADER_OPT_NONE)) { cli_exe_info_destroy(peinfo); return CL_EFORMAT; } @@ -5741,7 +5744,7 @@ cl_error_t cli_genhash_pe(cli_ctx *ctx, unsigned int class, cli_hash_type_t type // if so, use that to avoid having to re-parse the header cli_exe_info_init(peinfo, 0); - if (cli_peheader(ctx->fmap, peinfo, CLI_PEHEADER_OPT_NONE, NULL) != CL_SUCCESS) { + if (cli_peheader(ctx, peinfo, CLI_PEHEADER_OPT_NONE) != CL_SUCCESS) { cli_exe_info_destroy(peinfo); return CL_EFORMAT; } diff --git a/libclamav/pe.h b/libclamav/pe.h index 1cc633222..3a3070df6 100644 --- a/libclamav/pe.h +++ b/libclamav/pe.h @@ -88,7 +88,7 @@ enum { #define CLI_PEHEADER_OPT_REMOVE_MISSING_SECTIONS 0x10 cl_error_t cli_pe_targetinfo(cli_ctx *ctx, struct cli_exe_info *peinfo); -cl_error_t cli_peheader(fmap_t *map, struct cli_exe_info *peinfo, uint32_t opts, cli_ctx *ctx); +cl_error_t cli_peheader(cli_ctx *ctx, struct cli_exe_info *peinfo, uint32_t opts); cl_error_t cli_check_auth_header(cli_ctx *ctx, struct cli_exe_info *peinfo); cl_error_t cli_genhash_pe(cli_ctx *ctx, unsigned int class, cli_hash_type_t type, stats_section_t *hashes); diff --git a/libclamav/scanners.c b/libclamav/scanners.c index fc19cc63e..2cb2cca49 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -4143,10 +4143,10 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } - cli_exe_info_init(&peinfo, 0); + cli_exe_info_init(&peinfo, fpt->offset); // Header validity check to prevent false positives from being scanned. - ret = cli_peheader(ctx->fmap, &peinfo, CLI_PEHEADER_OPT_NONE, NULL); + ret = cli_peheader(ctx, &peinfo, CLI_PEHEADER_OPT_NONE); // peinfo memory may have been allocated and must be freed even if it failed. cli_exe_info_destroy(&peinfo); From e51a6a8cac354f4e7a709b17f047f1fc709882d8 Mon Sep 17 00:00:00 2001 From: "Val S." Date: Thu, 9 Oct 2025 17:40:14 -0400 Subject: [PATCH 2/9] Fix ZIP parser issue The function which indexes a ZIP central directory is not advancing to the next central directory record thus exceeding the max-files scan limit for many ZIPs. CLAM-2884 --- libclamav/unzip.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libclamav/unzip.c b/libclamav/unzip.c index 90920ec5b..d635522b8 100644 --- a/libclamav/unzip.c +++ b/libclamav/unzip.c @@ -2075,6 +2075,9 @@ cl_error_t unzip_search(cli_ctx *ctx, struct zip_requests *requests) status = CL_ETIMEOUT; goto done; } + + // Increment to the next central file header. + central_file_header_offset += file_record_size; } while ((ret == CL_SUCCESS) && (file_record_size > 0)); } else { cli_dbgmsg("unzip_search: Cannot locate central directory. unzip_search failed.\n"); From b8dc7be7e1e9ff7e09e1baa6de62c496e13a059f Mon Sep 17 00:00:00 2001 From: "Val S." Date: Thu, 9 Oct 2025 20:51:43 -0400 Subject: [PATCH 3/9] Scan performance optimization for TNEF message scans Uncompressed ZIP-based TNEF message attachments, like OOXML office document attachments, get double-extracted because of embedded file type recognition. To prevent excessive scan times, disable embedded file type recognition for TNEF files and relay on TNEF parsing to extract attachments. CLAM-2885 --- libclamav/scanners.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libclamav/scanners.c b/libclamav/scanners.c index 2cb2cca49..f21494545 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -3650,7 +3650,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi // Omit OLD TAR files because it's a raw archive format that we can extract and scan manually. (type != CL_TYPE_OLD_TAR) && // Omit POSIX TAR files because it's a raw archive format that we can extract and scan manually. - (type != CL_TYPE_POSIX_TAR)) { + (type != CL_TYPE_POSIX_TAR) && + // Omit TNEF files because TNEF message attachments are raw / not compressed. Document and ZIP attachments would be likely to have double-extraction issues. + (type != CL_TYPE_TNEF)) { /* * Enable file type recognition scan mode if requested, except for some problematic types (above). */ From 92af9bfffdc9e2e54b4202667af9924de096acf1 Mon Sep 17 00:00:00 2001 From: "Val S." Date: Thu, 9 Oct 2025 21:27:18 -0400 Subject: [PATCH 4/9] Fix issue recording OOXML document metadata The ZIP single record search feature is used to find specific files when parsing OOXML documents. I observed that the core properties for a PowerPoint file were missing in a test as compared with the previous release. The error handling check for the unzip search returns CL_VIRUS when there is a match, not CL_SUCCESS! CLAM-2886 --- libclamav/unzip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclamav/unzip.c b/libclamav/unzip.c index d635522b8..a68e305ec 100644 --- a/libclamav/unzip.c +++ b/libclamav/unzip.c @@ -2109,7 +2109,7 @@ cl_error_t unzip_search_single(cli_ctx *ctx, const char *name, size_t nlen, uint // Search for the zip file entry in the current layer. status = unzip_search(ctx, &requests); - if (CL_SUCCESS == status) { + if (CL_VIRUS == status) { *loff = requests.loff; } From 3f417e3d7253d5d834fbc792dc56334ef1d59708 Mon Sep 17 00:00:00 2001 From: "Val S." Date: Fri, 10 Oct 2025 20:32:23 -0400 Subject: [PATCH 5/9] Fix issue detecting VBA projects Previously for documents containing VBA projects, the VBA was treated as an object within the document and not as a normalized version of the document. I apparently switched it say that the VBA is a normalized version of the document. This kind of makes sense in that presently Javascript extracted from HTML is treated as a normalized version of the HTML. But it probably shouldn't. Normalized layers are treated as the same file as the parent. So now those older signatures that match on VBA projects using "Container:CL_TYPE_MSOLE2" are failing to match. So this commit switches it back. VBA project bits written out to a temp file for scanning will be treated as being contained within the document. CLAM-2896 --- libclamav/scanners.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclamav/scanners.c b/libclamav/scanners.c index f21494545..d432a6117 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -1665,7 +1665,7 @@ static cl_error_t cli_ole2_tempdir_scan_vba_new(const char *dir, cli_ctx *ctx, s goto done; } - ret = cli_scan_desc(tempfd, ctx, CL_TYPE_SCRIPT, false, NULL, AC_SCAN_VIR, NULL, "extracted-vba-project", tempfile, LAYER_ATTRIBUTES_NORMALIZED); + ret = cli_scan_desc(tempfd, ctx, CL_TYPE_SCRIPT, false, NULL, AC_SCAN_VIR, NULL, "extracted-vba-project", tempfile, LAYER_ATTRIBUTES_NONE); if (CL_SUCCESS != ret) { goto done; } From 85e67f60011b72917a919fa17e280c57cdba8efc Mon Sep 17 00:00:00 2001 From: "Val S." Date: Fri, 10 Oct 2025 22:45:52 -0400 Subject: [PATCH 6/9] Loosen restrictions on embedded file identification In regression testing against a large sample set, I found that strictly disallowing any embedded file identification if any previous layer was an embedded file resulted in missed detections. Specifically, I found an MSEXE file which has an embedded RAR, which in turn had another MSEXE that itself had an embedded 7ZIP containing... malware. sha256: c3cf573fd3d1568348506bf6dd6152d99368a7dc19037d135d5903bc1958ea85 To make it so ClamAV can extract all that, we must loosen the restriction and allow prior layers to be embedded, just not the current layer. I've also added some logic to prevent attempting to extract an object at the same offset twice. The `fpt->offset`s appear in order, but if you have multiple file type magic signatures match on the same address, like maybe you accidentally load two different .ftm files, then you'd get duplicates and double-extraction. As a bonus, I found I could also skip over offsets within a valid ZIP, ARJ, and CAB since we now have the length of those from the header check and as I know we don't want to extract embedded contents in that way. --- libclamav/scanners.c | 94 +++++++++----------------------------------- libclamav/unzip.c | 3 +- 2 files changed, 20 insertions(+), 77 deletions(-) diff --git a/libclamav/scanners.c b/libclamav/scanners.c index d432a6117..980182542 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -3670,6 +3670,8 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi // In allmatch-mode, ret will never be CL_VIRUS, so ret may be used exclusively for file type detection and for terminal errors. // When not in allmatch-mode, it's more important to return right away if ret is CL_VIRUS, so we don't care if file type matches were found. if (ret >= CL_TYPENO) { + size_t last_offset = 0; + // Matched 1+ file type signatures. Handle them. found_type = (cli_file_t)ret; @@ -3678,11 +3680,16 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi fpt = ftoffset; while (fpt) { - if (fpt->offset > 0) { + if ((fpt->offset > 0) && + // Only handle each offset once to prevent duplicate processing like if two signatures are found at the same offset. + ((size_t)fpt->offset > last_offset)) { + bool type_has_been_handled = true; bool ancestor_was_embedded = false; size_t i; + last_offset = (size_t)fpt->offset; + /* * First, use "embedded type recognition" to identify a file's actual type. * (a.k.a. not embedded files, but file type detection corrections) @@ -3873,84 +3880,10 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } - /* - * Only scan embedded files if we are not already in an embedded context. - * That is, if this or a previous layer was identified with embedded file type recognition, then we do - * not scan for embedded files again. - * - * This restriction will prevent detecting the same embedded content more than once when recursing with - * embedded file type recognition deeper within the same buffer. - * - * This is necessary because we have no way of knowing the length of a file for many formats and cannot - * prevent a search for embedded files from finding the same embedded content multiple times (like a LOT - * of times). - * - * E.g. if the file is like this: - * - * [ data ] [ embedded file ] [ data ] [ embedded file ] - * - * The first time we do it we'll find "two" embedded files, like this: - * - * Emb. File #1: [ embedded file ] [ data ] [ embedded file ] - * Emb. File #2: [ embedded file ] - * - * We must not scan Emb. File #1 again for embedded files, because it would double-extract Emb. File #2. - * - * There is a flaw in this logic, though. Suppose that we actually have: - * - * [ data ] [ compressed file w. recognizable magic bytes ] - * - * A first pass of the above will again identify "two" embedded files: - * - * Emb. File #1: [ compressed archive w. recognizable magic bytes ] - * Emb. File #2: [ magic bytes ] <- Compressed data/Not real file - * - * In this case, the magic bytes of a contained, compressed file is somehow still identifiable despite - * compression. The result is the Emb. File #2 will fail to be parsed and when we decompress Emb. File - * #1, then we maybe get something like this: - * - * Decompressed: [ data ] [ embedded file ] - * - * So if this happened... then we WOULD want to scan the decompressed file for embedded files. - * The problem is, we have no way of knowing how long embedded files are. - * We don't know if we have: - * - * A. [ data ] [ embedded file ] [ data ] [ embedded file ] - * or - * B. [ data ] [ embedded compressed archive w. recognizable magic bytes ] - * or - * C. [ data ] [ embedded uncompressed archive w. multiple file entries [ file 1 ] [ file 2 ] [ file 2 ] ] - * - * Some ideas for a more accurate solution: - * - * 1. Record the offset and size of each file extracted by the parsers. - * Then, when we do embedded file type recognition, we can check if the offset and size of the - * embedded file matches the offset and size of a file that was extracted by a parser. - * This falls apart a little bit for multiple layers of archives unless we also compare offsets within - * each layer. We could do that, but it would be a lot of work. And we'd probably want to take into - * consideration if files were decompressed or decrypted. ... I don't know a clean solution. - * - * 2. Have all parsers to run before embedded file type recognition and they each determine the length - * of the file they parsed, so we can differentiate between embedded files and appended files. - * For appended files, we would know they weren't extracted by a parser module and the parser for - * each of those would report the length of the file it parsed so we can use that to mitigate - * overlapping embedded file type recognition. - * But I highly doubt all file types can be parsed to determine the correct length of the file. - */ - for (i = ctx->recursion_level; i > 0; i--) { - if (ctx->recursion_stack[i].attributes & LAYER_ATTRIBUTES_EMBEDDED) { - // Found an ancestor that was embedded. - // Do not scan embedded files again. - ancestor_was_embedded = true; - break; - } - } - /* * Next, check for actual embedded files. */ - if ((false == ancestor_was_embedded) && - (false == type_has_been_handled)) { + if (false == type_has_been_handled) { cli_dbgmsg("%s signature found at %u\n", cli_ftname(fpt->type), (unsigned int)fpt->offset); type_has_been_handled = true; @@ -4003,6 +3936,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += zip_size - 1; // Note: size is definitely > 0 because header_check succeeded. + nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, @@ -4025,6 +3961,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += cab_size - 1; // Note: size is definitely > 0 because header_check succeeded. + nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, @@ -4048,6 +3987,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += arj_size - 1; // Note: size is definitely > 0 because header_check succeeded. + nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, diff --git a/libclamav/unzip.c b/libclamav/unzip.c index a68e305ec..069ca3796 100644 --- a/libclamav/unzip.c +++ b/libclamav/unzip.c @@ -1310,7 +1310,8 @@ cl_error_t index_local_file_headers_within_bounds( index = *num_records; if (start_offset > fsize || end_offset > fsize || start_offset > end_offset) { - cli_errmsg("index_local_file_headers_within_bounds: Invalid offset arguments\n"); + cli_errmsg("index_local_file_headers_within_bounds: Invalid offset arguments: start_offset=%u, end_offset=%u, fsize=%u\n", + start_offset, end_offset, fsize); status = CL_EPARSE; goto done; } From fa830aabdc9c4dd1796b2dc11f1a3fdaa6a01537 Mon Sep 17 00:00:00 2001 From: "Val S." Date: Sun, 12 Oct 2025 16:05:17 -0400 Subject: [PATCH 7/9] Increase limit for finding PE files embedded in other PE files I am seeing missed detections since we changed to prohibit embedded file type identification when inside an embedded file. In particular, I'm seeing this issue with PE files that contain multiple other MSEXE as well as a variety of false positives for PE file headers. For example, imagine a PE with four concatenated DLL's, like so: ``` [ EXE file | DLL #1 | DLL #2 | DLL #3 | DLL #4 ] ``` And note that false positives for embedded MSEXE files are fairly common. So there may be a few mixed in there. Before limiting embedded file identification we might interpret the file structure something like this: ``` MSEXE: { embedded MSEXE #1: false positive, embedded MSEXE #2: false positive, embedded MSEXE #3: false positive, embedded MSEXE #4: DLL #1: { embedded MSEXE #1: false positive, embedded MSEXE #2: DLL #2: { embedded MSEXE #1: DLL #3: { embedded MSEXE #1: false positive, embedded MSEXE #2: false positive, embedded MSEXE #3: false positive, embedded MSEXE #4: false positive, embedded MSEXE #5: DLL #4 } embedded MSEXE #2: false positive, embedded MSEXE #3: false positive, embedded MSEXE #4: false positive, embedded MSEXE #5: false positive, embedded MSEXE #6: DLL #4 } embedded MSEXE #3: DLL #3, embedded MSEXE #4: false positive, embedded MSEXE #5: false positive, embedded MSEXE #6: false positive, embedded MSEXE #7: false positive, embedded MSEXE #8: DLL #4 } } ``` This is obviously terrible, which is why why we don't allow detecting embedded files within other embedded files. So after we enforce that limit, the same file may be interpreted like this instead: ``` MSEXE: { embedded MSEXE #1: false positive, embedded MSEXE #2: false positive, embedded MSEXE #3: false positive, embedded MSEXE #4: DLL #1, embedded MSEXE #5: false positive, embedded MSEXE #6: DLL #2, embedded MSEXE #7: DLL #3, embedded MSEXE #8: false positive, embedded MSEXE #9: false positive, embedded MSEXE #10: false positive, embedded MSEXE #11: false positive, embedded MSEXE #12: DLL #4 } ``` That's great! Except that we now exceed the "MAX_EMBEDDED_OBJ" limit for embedded type matches (limit 10, but 12 found). That means we won't see or extract the 4th DLL anymore. My solution is to lift the limit when adding an matched MSEXE type. We already do this for matched ZIPSFX types. While doing this, I've significantly tidied up the limits checks to make it more readble, and removed duplicate checks from within the `ac_addtype()` function. CLAM-2897 --- libclamav/matcher-ac.c | 173 ++++++++++++++++++++++++++++++++++------- 1 file changed, 143 insertions(+), 30 deletions(-) diff --git a/libclamav/matcher-ac.c b/libclamav/matcher-ac.c index 77cc331c7..7f05eabb1 100644 --- a/libclamav/matcher-ac.c +++ b/libclamav/matcher-ac.c @@ -833,8 +833,9 @@ int cli_ac_chklsig(const char *expr, const char *end, uint32_t *lsigcnt, unsigne return -1; } - for (i += 2; i + 1 < len && (isdigit(expr[i + 1]) || expr[i + 1] == ','); i++) - ; + for (i += 2; i + 1 < len && (isdigit(expr[i + 1]) || expr[i + 1] == ','); i++) { + continue; + } } if (&expr[i + 1] == rend) @@ -1625,19 +1626,23 @@ void cli_ac_freedata(struct cli_ac_data *data) } } -/* returns only CL_SUCCESS or CL_EMEM */ -inline static int ac_addtype(struct cli_matched_type **list, cli_file_t type, off_t offset, const cli_ctx *ctx) +/** + * @brief Add a match for an object type to the list of matched types. + * + * Important: The caller is responsible for checking limits! + * + * @param list Pointer to the list of matched types. *list may be NULL if no types have been added yet. + * @param type The type of the embedded object. + * @param offset The offset of the embedded object. + * @param ctx The context information. May be NULL. + * @return cl_error_t CL_SUCCESS regardless if added, or CL_EMEM if memory allocation failed. + */ +inline static cl_error_t ac_addtype(struct cli_matched_type **list, cli_file_t type, off_t offset, const cli_ctx *ctx) { - struct cli_matched_type *tnode, *tnode_last; + struct cli_matched_type *tnode; - if (type == CL_TYPE_ZIPSFX) { - if (*list && ctx && ctx->engine->maxfiles && (*list)->cnt > ctx->engine->maxfiles) - return CL_SUCCESS; - } else if (*list && (*list)->cnt >= MAX_EMBEDDED_OBJ) { - return CL_SUCCESS; - } - - if (!(tnode = calloc(1, sizeof(struct cli_matched_type)))) { + tnode = calloc(1, sizeof(struct cli_matched_type)); + if (NULL == tnode) { cli_errmsg("cli_ac_addtype: Can't allocate memory for new type node\n"); return CL_EMEM; } @@ -1645,16 +1650,25 @@ inline static int ac_addtype(struct cli_matched_type **list, cli_file_t type, of tnode->type = type; tnode->offset = offset; - tnode_last = *list; - while (tnode_last && tnode_last->next) - tnode_last = tnode_last->next; + if (*list) { + // Add to end of existing list. + struct cli_matched_type *tnode_last = *list; - if (tnode_last) + while (tnode_last && tnode_last->next) { + tnode_last = tnode_last->next; + } tnode_last->next = tnode; - else + } else { + // First type in the list. *list = tnode; + } (*list)->cnt++; + + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: added %s embedded object at offset " STDi64 ". Embedded object count: %d\n", cli_ftname(type), (uint64_t)offset, (*list)->cnt); + } + return CL_SUCCESS; } @@ -1999,14 +2013,65 @@ cl_error_t cli_ac_scanbuff( cli_dbgmsg("Matched signature for file type %s\n", pt->virname); type = pt->type; - if ((ftoffset != NULL) && - ((*ftoffset == NULL) || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ || type == CL_TYPE_ZIPSFX) && (type >= CL_TYPE_SFX || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2) && type == CL_TYPE_MSEXE))) { - /* FIXME: the first offset in the array is most likely the correct one but - * it may happen it is not - */ - for (j = 1; j <= CLI_DEFAULT_AC_TRACKLEN + 1 && offmatrix[0][j] != (uint32_t)-1; j++) - if (ac_addtype(ftoffset, type, offmatrix[pt->parts - 1][j], ctx)) - return CL_EMEM; + + if (ftoffset != NULL) { + // Caller provided a pointer to record matched types. + bool too_many_types = false; + bool supported_type = false; + + if (*ftoffset != NULL) { + // Have some type matches already. Check limits. + + if (ctx && ((type == CL_TYPE_ZIPSFX) || + (type == CL_TYPE_MSEXE && ftype == CL_TYPE_MSEXE))) { + // When ctx present, limit the number of type matches using ctx->engine->maxfiles for specific types. + // Reasoning: + // ZIP local file header entries likely to be numerous if a single ZIP appended to the scanned file. + // MSEXE can contain many embedded MSEXE entries and MSEXE type false positives matches. + + if (ctx->engine->maxfiles == 0) { + // Max-files limit is disabled. + } else if ((*ftoffset)->cnt >= ctx->engine->maxfiles) { + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached maxfiles limit of %u\n", cli_ftname(type), (*ftoffset)->offset, ctx->engine->maxfiles); + } + too_many_types = true; + } + } else { + // Limit the number of type matches using MAX_EMBEDDED_OBJ. + if ((*ftoffset)->cnt >= MAX_EMBEDDED_OBJ) { + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached MAX_EMBEDDED_OBJ limit of %u\n", cli_ftname(type), (*ftoffset)->offset, MAX_EMBEDDED_OBJ); + } + too_many_types = true; + } + } + } + + // Filter to supported types. + if ( + // Found type is MBR. + type == CL_TYPE_MBR || + // Found type is any SFX type (i.e., ZIPSFX, RARSFX, 7ZSSFX, etc.). + type >= CL_TYPE_SFX || + // Found type is an MSEXE, but only if host file type is one of MSEXE, ZIP, or MSOLE2. + (type == CL_TYPE_MSEXE && (ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2))) { + + supported_type = true; + } + + if (supported_type && !too_many_types) { + /* FIXME: the first offset in the array is most likely the correct one but + * it may happen it is not + * Until we're certain and can fix this, we add all offsets in the list. + */ + for (j = 1; j <= CLI_DEFAULT_AC_TRACKLEN + 1 && offmatrix[0][j] != (uint32_t)-1; j++) { + ret = ac_addtype(ftoffset, type, offmatrix[pt->parts - 1][j], ctx); + if (CL_SUCCESS != ret) { + return ret; + } + } + } } memset(offmatrix[0], (uint32_t)-1, pt->parts * (CLI_DEFAULT_AC_TRACKLEN + 2) * sizeof(uint32_t)); @@ -2066,11 +2131,59 @@ cl_error_t cli_ac_scanbuff( cli_dbgmsg("Matched signature for file type %s at %u\n", pt->virname, realoff); type = pt->type; - if ((ftoffset != NULL) && - ((*ftoffset == NULL) || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ || type == CL_TYPE_ZIPSFX) && (type == CL_TYPE_MBR || type >= CL_TYPE_SFX || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2) && type == CL_TYPE_MSEXE))) { - if (ac_addtype(ftoffset, type, realoff, ctx)) - return CL_EMEM; + if (ftoffset != NULL) { + // Caller provided a pointer to record matched types. + bool too_many_types = false; + bool supported_type = false; + + if (*ftoffset != NULL) { + // Have some type matches already. Check limits. + + if (ctx && ((type == CL_TYPE_ZIPSFX) || + (type == CL_TYPE_MSEXE && ftype == CL_TYPE_MSEXE))) { + // When ctx present, limit the number of type matches using ctx->engine->maxfiles for specific types. + // Reasoning: + // ZIP local file header entries likely to be numerous if a single ZIP appended to the scanned file. + // MSEXE can contain many embedded MSEXE entries and MSEXE type false positives matches. + + if (ctx->engine->maxfiles == 0) { + // Max-files limit is disabled. + } else if ((*ftoffset)->cnt >= ctx->engine->maxfiles) { + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached maxfiles limit of %u\n", cli_ftname(type), (*ftoffset)->offset, ctx->engine->maxfiles); + } + too_many_types = true; + } + } else { + // Limit the number of type matches using MAX_EMBEDDED_OBJ. + if ((*ftoffset)->cnt >= MAX_EMBEDDED_OBJ) { + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached MAX_EMBEDDED_OBJ limit of %u\n", cli_ftname(type), (*ftoffset)->offset, MAX_EMBEDDED_OBJ); + } + too_many_types = true; + } + } + } + + // Filter to supported types. + if ( + // Found type is MBR. + type == CL_TYPE_MBR || + // Found type is any SFX type (i.e., ZIPSFX, RARSFX, 7ZSSFX, etc.). + type >= CL_TYPE_SFX || + // Found type is an MSEXE, but only if host file type is one of MSEXE, ZIP, or MSOLE2. + (type == CL_TYPE_MSEXE && (ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2))) { + + supported_type = true; + } + + if (supported_type && !too_many_types) { + ret = ac_addtype(ftoffset, type, realoff, ctx); + if (CL_SUCCESS != ret) { + return ret; + } + } } } } else { From 87e389d1b2f3e2b6c0a6b5212581a1d079181a0b Mon Sep 17 00:00:00 2001 From: "Val S." Date: Sun, 12 Oct 2025 16:15:39 -0400 Subject: [PATCH 8/9] Fix compiler warning Mismatched declaration and definition. --- libclamav/matcher.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclamav/matcher.h b/libclamav/matcher.h index 32581f0de..310451b46 100644 --- a/libclamav/matcher.h +++ b/libclamav/matcher.h @@ -390,6 +390,6 @@ cl_error_t cli_matchmeta(cli_ctx *ctx, const char *fname, size_t fsizec, size_t * - 9 - MachO * @param ctx The current scan context */ -void cli_targetinfo(struct cli_target_info *info, unsigned int target, cli_ctx *ctx); +void cli_targetinfo(struct cli_target_info *info, cli_target_t target, cli_ctx *ctx); #endif From e98b0075e6cb5f7a40e6f19f220b5a55ce733eec Mon Sep 17 00:00:00 2001 From: "Val S." Date: Sun, 12 Oct 2025 19:12:06 -0400 Subject: [PATCH 9/9] Fix ZIP parser issue recording empty file entries If csize (and usize) are 0, like with a directory or other empty file entry, then the functionionality to record file record information when indexing the central directory and each associated file record will neglect to store the `local_header_offset` or `local_header_size`. That causes problems later after sorting the file records and then checking for overlapping files. CLAM-2884 --- libclamav/unzip.c | 63 +++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/libclamav/unzip.c b/libclamav/unzip.c index 069ca3796..e9b912da4 100644 --- a/libclamav/unzip.c +++ b/libclamav/unzip.c @@ -717,17 +717,35 @@ static cl_error_t parse_local_file_header( zip += LOCAL_HEADER_elen; bytes_remaining -= LOCAL_HEADER_elen; - if (!csize) { /* FIXME: what's used for method0 files? csize or usize? Nothing in the specs, needs testing */ - cli_dbgmsg("cli_unzip: local header - skipping empty file\n"); - } else { - if (bytes_remaining < csize) { - cli_dbgmsg("cli_unzip: local header - stream out of file\n"); - status = CL_EPARSE; - goto done; - } + if (bytes_remaining < csize) { + cli_dbgmsg("cli_unzip: local header - stream out of file\n"); + status = CL_EPARSE; + goto done; + } + if (NULL != record) { /* Don't actually unzip if we're just collecting the file record information (offset, sizes) */ - if (NULL == record) { + if (NULL == original_filename) { + record->original_filename = NULL; + } else { + record->original_filename = CLI_STRNDUP(original_filename, strlen(original_filename)); + } + record->local_header_offset = loff; + record->local_header_size = zip - local_header; + record->compressed_size = csize; + record->uncompressed_size = usize; + record->method = LOCAL_HEADER_method; + record->flags = LOCAL_HEADER_flags; + record->encrypted = (LOCAL_HEADER_flags & F_ENCR) ? 1 : 0; + + status = CL_SUCCESS; + } else { + /* + * Unzip or decompress & then unzip. + */ + if (!csize) { /* FIXME: what's used for method0 files? csize or usize? Nothing in the specs, needs testing */ + cli_dbgmsg("cli_unzip: local header - skipping empty file\n"); + } else { zip = fmap_need_ptr_once(ctx->fmap, zip, csize); if (NULL == zip) { cli_dbgmsg("cli_unzip: local header - data out of file\n"); @@ -751,27 +769,12 @@ static cl_error_t parse_local_file_header( goto done; } } - } else { - if (NULL == original_filename) { - record->original_filename = NULL; - } else { - record->original_filename = CLI_STRNDUP(original_filename, strlen(original_filename)); - } - record->local_header_offset = loff; - record->local_header_size = zip - local_header; - record->compressed_size = csize; - record->uncompressed_size = usize; - record->method = LOCAL_HEADER_method; - record->flags = LOCAL_HEADER_flags; - record->encrypted = (LOCAL_HEADER_flags & F_ENCR) ? 1 : 0; - - status = CL_SUCCESS; } - - zip += csize; - bytes_remaining -= csize; } + zip += csize; + bytes_remaining -= csize; + if (LOCAL_HEADER_flags & F_USEDD) { if (bytes_remaining < 12) { cli_dbgmsg("cli_unzip: local header - data desc out of file\n"); @@ -910,8 +913,8 @@ static cl_error_t parse_central_directory_file_header( central_header = fmap_need_off(ctx->fmap, central_file_header_offset, SIZEOF_CENTRAL_HEADER); if (NULL == central_header) { - cli_dbgmsg("cli_unzip: central header - file header offset out of file\n"); - status = CL_EPARSE; + cli_dbgmsg("cli_unzip: central header - reached end of central directory.\n"); + status = CL_BREAK; goto done; } @@ -1311,7 +1314,7 @@ cl_error_t index_local_file_headers_within_bounds( if (start_offset > fsize || end_offset > fsize || start_offset > end_offset) { cli_errmsg("index_local_file_headers_within_bounds: Invalid offset arguments: start_offset=%u, end_offset=%u, fsize=%u\n", - start_offset, end_offset, fsize); + start_offset, end_offset, fsize); status = CL_EPARSE; goto done; }