Reduce unnecessary scanning of embedded file FPs (#1571)

When embedded file type recognition finds a possible embedded file, it is being scanned as a new embedded file even if it turns out it was a false positive and parsing fails. My solution is to pre-parse the file headers as little as possible to determine if it is valid. If possible, also determine the file size based on the headers. That will make it so we don't have to scan additional data when the embedded file is not at the very end. This commit adds header checks prior to embedded ZIP, ARJ, and CAB scanning. For these types I was also able to use the header checks to determine the object size so as to prevent excessive pattern matching. TODO: Add the same for RAR, EGG, 7Z, NULSFT, AUTOIT, IShield, and PDF. This commit also removes duplicate matching for embedded MSEXE. The embedded MSEXE detection and scanning logic was accidentally creating an extra duplicate layer in between scanning and detection because of the logic within the `cli_scanembpe()` function. That function was effectively doing the header check which this commit adds for ZIP, ARJ, and CAB but minus the size check. Note: It is unfortunately not possible to get an accurate size from PE file headers. The `cli_scanembpe()` function also used to dump to a temp file for no reason since FMAPs were extended to support windows into other FMAPs. So this commit removes the intermediate layer and no longer drops a temp file for each embedded PE file. Further, this commit adds configuration and DCONF safeguards around all embedded file type scanning. Finally, this commit adds a set of tests to validate proper extraction of embedded ZIP, ARJ, CAB, and MSEXE files. CLAM-2862 Co-authored-by: TheRaynMan <draynor@sourcefire.com>
2025-12-08 06:09:46 +00:00 · 2025-09-23 15:57:28 -04:00 · 2025-09-23 15:57:28 -04:00 · a77a271fb5
commit a77a271fb5
parent 1d158c13d4
28 changed files with 618 additions and 228 deletions
--- a/libclamav/libmspack.c
+++ b/libclamav/libmspack.c
@ -133,7 +133,7 @@ static void mspack_fmap_close(struct mspack_file *file)
 static int mspack_fmap_read(struct mspack_file *file, void *buffer, int bytes)
 {
    struct mspack_handle *mspack_handle = (struct mspack_handle *)file;
-    off_t offset;
+    size_t offset;
    size_t count;
    int ret;

@ -150,7 +150,7 @@ static int mspack_fmap_read(struct mspack_file *file, void *buffer, int bytes)
        /* Use fmap */
        offset = mspack_handle->offset + mspack_handle->org;

-        count = fmap_readn(mspack_handle->fmap, buffer, (size_t)offset, (size_t)bytes);
+        count = fmap_readn(mspack_handle->fmap, buffer, offset, (size_t)bytes);
        if (count == (size_t)-1) {
            cli_dbgmsg("%s() %d requested %d bytes, read failed (-1)\n", __func__, __LINE__, bytes);
            return -1;
@ -163,7 +163,7 @@ static int mspack_fmap_read(struct mspack_file *file, void *buffer, int bytes)
        return (int)count;
    } else {
        /* Use file descriptor */
-        count = fread(buffer, bytes, 1, mspack_handle->f);
+        count = fread(buffer, (size_t)bytes, 1, mspack_handle->f);
        if (count < 1) {
            cli_dbgmsg("%s() %d requested %d bytes, read failed (%zu)\n", __func__, __LINE__, bytes, count);
            return -1;
@ -340,18 +340,83 @@ static struct mspack_system mspack_sys_fmap_ops = {
    .copy    = mspack_fmap_copy,
 };

-cl_error_t cli_scanmscab(cli_ctx *ctx, off_t sfx_offset)
+cl_error_t cli_mscab_header_check(cli_ctx *ctx, size_t offset, size_t *size)
+{
+    cl_error_t status = CL_EFORMAT;
+
+    struct mscab_decompressor *cab_d = NULL;
+    struct mscabd_cabinet *cab_h     = NULL;
+    struct mspack_name mspack_fmap   = {0};
+    struct mspack_system_ex ops_ex   = {0};
+
+    if (NULL == ctx || NULL == size) {
+        cli_dbgmsg("%s() invalid argument\n", __func__);
+        status = CL_EARG;
+        goto done;
+    }
+
+    *size            = 0;
+    mspack_fmap.fmap = ctx->fmap;
+
+    if (offset > INT32_MAX) {
+        cli_dbgmsg("%s() offset too large %zu\n", __func__, offset);
+        status = CL_EFORMAT;
+        goto done;
+    }
+
+    mspack_fmap.org = (off_t)offset;
+
+    ops_ex.ops = mspack_sys_fmap_ops;
+
+    cab_d = mspack_create_cab_decompressor(&ops_ex.ops);
+    if (NULL == cab_d) {
+        cli_dbgmsg("%s() failed at %d\n", __func__, __LINE__);
+        status = CL_EUNPACK;
+        goto done;
+    }
+
+    cab_h = cab_d->open(cab_d, (char *)&mspack_fmap);
+    if (NULL == cab_h) {
+        cli_dbgmsg("%s() failed at %d\n", __func__, __LINE__);
+        status = CL_EFORMAT;
+        goto done;
+    }
+
+    *size = (size_t)cab_h->length;
+
+    cli_dbgmsg("%s(): Successfully read CAB header for CAB of size %zu\n", __func__, *size);
+    status = CL_SUCCESS;
+
+done:
+    if (NULL != cab_d) {
+        if (NULL != cab_h) {
+            cab_d->close(cab_d, cab_h);
+        }
+        mspack_destroy_cab_decompressor(cab_d);
+    }
+
+    return status;
+}
+
+cl_error_t cli_scanmscab(cli_ctx *ctx, size_t sfx_offset)
 {
    cl_error_t ret                   = CL_SUCCESS;
    struct mscab_decompressor *cab_d = NULL;
    struct mscabd_cabinet *cab_h     = NULL;
    struct mscabd_file *cab_f        = NULL;
    int files;
-    struct mspack_name mspack_fmap = {
-        .fmap = ctx->fmap,
-        .org  = sfx_offset,
-    };
-    struct mspack_system_ex ops_ex;
+    struct mspack_name mspack_fmap = {0};
+    struct mspack_system_ex ops_ex = {0};
+
+    mspack_fmap.fmap = ctx->fmap;
+
+    if (sfx_offset > INT32_MAX) {
+        cli_dbgmsg("%s() offset too large %zu\n", __func__, sfx_offset);
+        ret = CL_EFORMAT;
+        goto done;
+    }
+
+    mspack_fmap.org = (off_t)sfx_offset;

    char *tmp_fname      = NULL;
    bool tempfile_exists = false;
--- a/libclamav/libmspack.h
+++ b/libclamav/libmspack.h
@ -10,7 +10,31 @@
 #ifndef __LIBMSPACK_H__
 #define __LIBMSPACK_H__

-int cli_scanmscab(cli_ctx *ctx, off_t sfx_offset);
-int cli_scanmschm(cli_ctx *ctx);
+/**
+ * @brief Check the CAB header for validity.
+ *
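+ * @param ctx           Scan context; the CAB is read from ctx->fmap.
+ * @param offset        Offset of the start of a CAB file within the current fmap.
+ * @param[out] size     On success, the length of the CAB file as reported by libmspack; set to 0 on failure.
+ * @return cl_error_t   CL_SUCCESS if the CAB header was opened; CL_EARG, CL_EFORMAT, or CL_EUNPACK otherwise.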
+ */
+cl_error_t cli_mscab_header_check(cli_ctx *ctx, size_t offset, size_t *size);
+
+/**
+ * @brief Open and extract a Microsoft CAB file, scanning each extracted file.
+ *
+ * @param ctx           Scan context
+ * @param sfx_offset    Offset of the start of a CAB file within the current fmap.
+ * @return cl_error_t   CL_SUCCESS on success, or an error code on failure.
+ */
+cl_error_t cli_scanmscab(cli_ctx *ctx, size_t sfx_offset);
+
+/**
+ * @brief Open and extract a Microsoft CHM file, scanning each extracted file.
+ *
+ * @param ctx           Scan context
+ * @return cl_error_t   CL_SUCCESS on success, or an error code on failure.
+ */
+cl_error_t cli_scanmschm(cli_ctx *ctx);

 #endif
--- a/libclamav/matcher.c
+++ b/libclamav/matcher.c
@ -602,7 +602,10 @@ cl_error_t cli_check_fp(cli_ctx *ctx, const char *vname)

        need_hash[CLI_HASH_SHA2_256] = cli_hm_have_size(ctx->engine->hm_fp, CLI_HASH_SHA2_256, map->len) ||
                                       cli_hm_have_wild(ctx->engine->hm_fp, CLI_HASH_SHA2_256) ||
-                                       cli_hm_have_size(ctx->engine->hm_fp, CLI_HASH_SHA2_256, 1);
+                                       cli_hm_have_size(ctx->engine->hm_fp, CLI_HASH_SHA2_256, 1) ||
+                                       // If debug logging is enabled, we want to calculate SHA256 hashes for all layers.
+                                       // Some users rely on the debug log output to create new FP signatures.
+                                       cli_debug_flag;

        /* Set fmap to need hash later if required.
         * This is an optimization so we can calculate all needed hashes in one pass. */
@ -629,13 +632,14 @@ cl_error_t cli_check_fp(cli_ctx *ctx, const char *vname)
                    goto done;
                }

+                if (cli_debug_flag ||
+                    ((CLI_HASH_MD5 == hash_type) && (ctx->engine->cb_hash))) {
                    /* Convert hash to string */
                    for (i = 0; i < hash_len; i++) {
                        sprintf(hash_string + i * 2, "%02x", hash[i]);
                    }
                    hash_string[hash_len * 2] = 0;

-                if (cli_debug_flag || ctx->engine->cb_hash) {
                    const char *name = ctx->recursion_stack[stack_index].fmap->name;
                    const char *type = cli_ftname(ctx->recursion_stack[stack_index].type);

--- a/libclamav/matcher.h
+++ b/libclamav/matcher.h
@ -360,7 +360,7 @@ cl_error_t cli_scan_fmap(cli_ctx *ctx, cli_file_t ftype, bool filetype_only, str
 */
 cl_error_t cli_exp_eval(cli_ctx *ctx, struct cli_matcher *root, struct cli_ac_data *acdata, struct cli_target_info *target_info);

-cl_error_t cli_caloff(const char *offstr, const struct cli_target_info *info, unsigned int target, uint32_t *offdata, uint32_t *offset_min, uint32_t *offset_max);
+cl_error_t cli_caloff(const char *offstr, const struct cli_target_info *info, cli_target_t target, uint32_t *offdata, uint32_t *offset_min, uint32_t *offset_max);

 /**
 * @brief Determine if an alert is a known false positive, using each fmap in the ctx->container stack to check MD5, SHA1, and SHA2-256 hashes.
--- a/libclamav/pe.c
+++ b/libclamav/pe.c
@ -4658,10 +4658,7 @@ cl_error_t cli_peheader(fmap_t *map, struct cli_exe_info *peinfo, uint32_t opts,
            pe_add_heuristic_property(ctx, "BadNumberOfSections");
        }

-        // TODO Investigate how corrupted_input is set and whether this
-        // check is needed
-        if (opts & CLI_PEHEADER_OPT_DBG_PRINT_INFO &&
-            !ctx->corrupted_input) {
+        if ((opts & CLI_PEHEADER_OPT_DBG_PRINT_INFO) && !ctx->corrupted_input) {
            if (peinfo->nsections == 0) {
                cli_dbgmsg("cli_peheader: Invalid NumberOfSections (0)\n");
            }
--- a/libclamav/scanners.c
+++ b/libclamav/scanners.c
@ -997,7 +997,7 @@ static cl_error_t cli_scanarj(cli_ctx *ctx)
    do {
        metadata.filename = NULL;

-        ret = cli_unarj_prepare_file(dir, &metadata);
+        ret = cli_unarj_prepare_file(&metadata);
        if (ret != CL_SUCCESS) {
            cli_dbgmsg("ARJ: cli_unarj_prepare_file Error: %s\n", cl_strerror(ret));
            break;
@ -3447,94 +3447,6 @@ static cl_error_t cli_scan_structured(cli_ctx *ctx)
    return CL_SUCCESS;
 }

-static cl_error_t cli_scanembpe(cli_ctx *ctx, off_t offset)
-{
-    cl_error_t ret = CL_SUCCESS;
-    int fd;
-    size_t bytes;
-    size_t size = 0;
-    size_t todo;
-    const char *buff;
-    char *tmpname;
-    fmap_t *map = ctx->fmap;
-    unsigned int corrupted_input;
-
-    tmpname = cli_gentemp_with_prefix(ctx->this_layer_tmpdir, "embedded-pe");
-    if (!tmpname)
-        return CL_EMEM;
-
-    if ((fd = open(tmpname, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR)) < 0) {
-        cli_errmsg("cli_scanembpe: Can't create file %s\n", tmpname);
-        free(tmpname);
-        return CL_ECREAT;
-    }
-
-    todo = map->len - offset;
-    while (1) {
-        bytes = MIN(todo, map->pgsz);
-        if (!bytes)
-            break;
-
-        if (!(buff = fmap_need_off_once(map, offset + size, bytes))) {
-            close(fd);
-            if (!ctx->engine->keeptmp) {
-                if (cli_unlink(tmpname)) {
-                    free(tmpname);
-                    return CL_EUNLINK;
-                }
-            }
-            free(tmpname);
-            return CL_EREAD;
-        }
-        size += bytes;
-        todo -= bytes;
-
-        if (cli_checklimits("cli_scanembpe", ctx, size, 0, 0) != CL_SUCCESS)
-            break;
-
-        if (cli_writen(fd, buff, bytes) != bytes) {
-            cli_dbgmsg("cli_scanembpe: Can't write to temporary file\n");
-            close(fd);
-            if (!ctx->engine->keeptmp) {
-                if (cli_unlink(tmpname)) {
-                    free(tmpname);
-                    return CL_EUNLINK;
-                }
-            }
-            free(tmpname);
-            return CL_EWRITE;
-        }
-    }
-
-    // Setting ctx->corrupted_input will prevent the PE parser from reporting "broken executable" for unpacked/reconstructed files that may not be 100% to spec.
-    corrupted_input      = ctx->corrupted_input;
-    ctx->corrupted_input = 1;
-    ret                  = cli_magic_scan_desc(fd, tmpname, ctx, NULL, LAYER_ATTRIBUTES_NONE);
-    ctx->corrupted_input = corrupted_input;
-    if (ret != CL_SUCCESS) {
-        close(fd);
-        if (!ctx->engine->keeptmp) {
-            if (cli_unlink(tmpname)) {
-                free(tmpname);
-                return CL_EUNLINK;
-            }
-        }
-        free(tmpname);
-        return ret;
-    }
-
-    close(fd);
-    if (!ctx->engine->keeptmp) {
-        if (cli_unlink(tmpname)) {
-            free(tmpname);
-            return CL_EUNLINK;
-        }
-    }
-    free(tmpname);
-
-    return CL_SUCCESS;
-}
-
 #if defined(_WIN32) || defined(C_LINUX) || defined(C_DARWIN)
 #define PERF_MEASURE
 #endif
@ -3720,26 +3632,33 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
    cli_file_t found_type;

    if ((typercg) &&
-        // We should also omit bzips, but DMG's may be detected in bzips. (type != CL_TYPE_BZ) &&        /* Omit BZ files because they can contain portions of original files like zip file entries that cause invalid extractions and lots of warnings. Decompress first, then scan! */
-        (type != CL_TYPE_GZ) &&         /* Omit GZ files because they can contain portions of original files like zip file entries that cause invalid extractions and lots of warnings. Decompress first, then scan! */
-        (type != CL_TYPE_CPIO_OLD) &&   /* Omit CPIO_OLD files because it's an image format that we can extract and scan manually. */
-        (type != CL_TYPE_ZIP) &&        /* Omit ZIP files because it'll detect each zip file entry as SFXZIP, which is a waste. We'll extract it and then scan. */
-        (type != CL_TYPE_ZIPSFX) &&     /* Omit SFX archive types from being checked for embedded content. They should only be parsed for contained files. Those contained files could be EXE's with more SFX, but that's the nature of containers. */
-        (type != CL_TYPE_ARJSFX) &&     /* " */
-        (type != CL_TYPE_RARSFX) &&     /* " */
-        (type != CL_TYPE_EGGSFX) &&     /* " */
-        (type != CL_TYPE_CABSFX) &&     /* " */
-        (type != CL_TYPE_7ZSFX) &&      /* " */
-        (type != CL_TYPE_OOXML_WORD) && /* Omit OOXML because they are ZIP-based and file-type scanning will double-extract their contents. */
-        (type != CL_TYPE_OOXML_PPT) &&  /* " */
-        (type != CL_TYPE_OOXML_XL) &&   /* " */
-        (type != CL_TYPE_OOXML_HWP) &&  /* " */
-        (type != CL_TYPE_OLD_TAR) &&    /* Omit OLD TAR files because it's a raw archive format that we can extract and scan manually. */
-        (type != CL_TYPE_POSIX_TAR)) {  /* Omit POSIX TAR files because it's a raw archive format that we can extract and scan manually. */
+        // Omit embedded files or file types already identified via this process.
+        (!(ctx->recursion_stack[ctx->recursion_level].attributes & LAYER_ATTRIBUTES_EMBEDDED)) &&
+        // Omit GZ files because they can contain portions of original files like zip file entries that cause invalid extractions and lots of warnings. Decompress first, then scan!
+        (type != CL_TYPE_GZ) &&
+        // We should also omit bzips, but DMG's may be detected in bzips.
+        //(type != CL_TYPE_BZ) &&
+        // Omit CPIO_OLD files because it's an image format that we can extract and scan manually.
+        (type != CL_TYPE_CPIO_OLD) &&
+        // Omit ZIP files because it'll detect each zip file entry as SFXZIP, which is a waste. We'll extract it and then scan.
+        (type != CL_TYPE_ZIP) &&
+        // Omit OOXML because they are ZIP-based and file-type scanning will double-extract their contents.
+        (type != CL_TYPE_OOXML_WORD) &&
+        (type != CL_TYPE_OOXML_PPT) &&
+        (type != CL_TYPE_OOXML_XL) &&
+        (type != CL_TYPE_OOXML_HWP) &&
+        // Omit OLD TAR files because it's a raw archive format that we can extract and scan manually.
+        (type != CL_TYPE_OLD_TAR) &&
+        // Omit POSIX TAR files because it's a raw archive format that we can extract and scan manually.
+        (type != CL_TYPE_POSIX_TAR)) {
        /*
         * Enable file type recognition scan mode if requested, except for some problematic types (above).
         */
        acmode |= AC_SCAN_FT;
+    } else {
+        cli_dbgmsg("scanraw: embedded type recognition disabled or not applicable for type %s %s\n",
+                   cli_ftname(type),
+                   (ctx->recursion_stack[ctx->recursion_level].attributes & LAYER_ATTRIBUTES_EMBEDDED) ? "(embedded layer)" : "");
    }

    perf_start(ctx, PERFT_RAW);
@ -3960,8 +3879,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                 * This restriction will prevent detecting the same embedded content more than once when recursing with
                 * embedded file type recognition deeper within the same buffer.
                 *
-                 * This is necessary because we have no way of knowing the length of a file and cannot prevent a search
-                 * for embedded files from finding the same embedded content multiple times (like a LOT of times).
+                 * This is necessary because we have no way of knowing the length of a file for many formats and cannot
+                 * prevent a search for embedded files from finding the same embedded content multiple times (like a LOT
+                 * of times).
                 *
                 * E.g. if the file is like this:
                 *
@ -3990,7 +3910,7 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                 *  Decompressed:    [ data                   ] [ embedded file ]
                 *
                 * So if this happened... then we WOULD want to scan the decompressed file for embedded files.
-                 * The problem is, we have way of knowing how long embedded files are.
+                 * The problem is, we have no way of knowing how long embedded files are.
                 * We don't know if we have:
                 *
                 * A.       [ data ] [ embedded file ] [ data ] [ embedded file ]
@ -4035,7 +3955,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi

                    switch (fpt->type) {
                        case CL_TYPE_RARSFX:
-                            if (type != CL_TYPE_RAR) {
+                            if ((have_rar && SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_RAR)) &&
+                                (type != CL_TYPE_RAR)) {
+                                // TODO: Add header validity check to prevent false positives from being scanned.
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
@ -4048,7 +3970,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_EGGSFX:
-                            if (type != CL_TYPE_EGG) {
+                            if ((SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_EGG)) &&
+                                (type != CL_TYPE_EGG)) {
+                                // TODO: Add header validity check to prevent false positives from being scanned.
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
@ -4061,11 +3985,26 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_ZIPSFX:
-                            if (type != CL_TYPE_ZIP) {
+                            if ((SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_ZIP)) &&
+                                (type != CL_TYPE_ZIP) &&
+                                /* OOXML are ZIP-based. */
+                                (type != CL_TYPE_OOXML_WORD) &&
+                                (type != CL_TYPE_OOXML_PPT) &&
+                                (type != CL_TYPE_OOXML_XL) &&
+                                (type != CL_TYPE_OOXML_HWP)) {
+                                // Header validity check to prevent false positives from being scanned.
+                                size_t zip_size = 0;
+
+                                ret = cli_unzip_single_header_check(ctx, fpt->offset, &zip_size);
+                                if (ret != CL_SUCCESS) {
+                                    cli_dbgmsg("ZIP single header check failed: %s (%d)\n", cl_strerror(ret), ret);
+                                    break;
+                                }
+
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
-                                    ctx->fmap->len - fpt->offset,
+                                    zip_size,
                                    ctx,
                                    CL_TYPE_ZIP,
                                    NULL,
@ -4074,11 +4013,20 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_CABSFX:
-                            if (type != CL_TYPE_MSCAB) {
+                            if ((SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_CAB)) &&
+                                (type != CL_TYPE_MSCAB)) {
+                                // Header validity check to prevent false positives from being scanned.
+                                size_t cab_size = 0;
+                                ret             = cli_mscab_header_check(ctx, fpt->offset, &cab_size);
+                                if (ret != CL_SUCCESS) {
+                                    cli_dbgmsg("CAB header check failed: %s (%d)\n", cl_strerror(ret), ret);
+                                    break;
+                                }
+
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
-                                    ctx->fmap->len - fpt->offset,
+                                    cab_size,
                                    ctx,
                                    CL_TYPE_MSCAB,
                                    NULL,
@ -4087,11 +4035,21 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_ARJSFX:
-                            if (type != CL_TYPE_ARJ) {
+                            if ((SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_ARJ)) &&
+                                (type != CL_TYPE_ARJ)) {
+                                // Header validity check to prevent false positives from being scanned.
+                                size_t arj_size = 0;
+
+                                ret = cli_unarj_header_check(ctx, fpt->offset, &arj_size);
+                                if (ret != CL_SUCCESS) {
+                                    cli_dbgmsg("ARJ header check failed: %s (%d)\n", cl_strerror(ret), ret);
+                                    break;
+                                }
+
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
-                                    ctx->fmap->len - fpt->offset,
+                                    arj_size,
                                    ctx,
                                    CL_TYPE_ARJ,
                                    NULL,
@ -4100,7 +4058,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_7ZSFX:
-                            if (type != CL_TYPE_7Z) {
+                            if ((SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_7Z)) &&
+                                (type != CL_TYPE_7Z)) {
+                                // TODO: Add header validity check to prevent false positives from being scanned.
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
@ -4113,8 +4073,10 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_NULSFT:
-                            if (type == CL_TYPE_MSEXE && fpt->offset > 4) {
                            // Note: CL_TYPE_NULSFT is special, because the file actually starts 4 bytes before the start of the signature match
+                            if ((SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_NSIS)) &&
+                                (type == CL_TYPE_MSEXE && fpt->offset > 4)) {
+                                // TODO: Add header validity check to prevent false positives from being scanned.
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset - 4,
@ -4127,7 +4089,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_AUTOIT:
-                            if (type == CL_TYPE_MSEXE) {
+                            if ((SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_AUTOIT)) &&
+                                (type == CL_TYPE_MSEXE)) {
+                                // TODO: Add header validity check to prevent false positives from being scanned.
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
@ -4140,7 +4104,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_ISHIELD_MSI:
-                            if (type == CL_TYPE_MSEXE) {
+                            if ((SCAN_PARSE_ARCHIVE && (DCONF_ARCH & ARCH_CONF_ISHIELD)) &&
+                                (type == CL_TYPE_MSEXE)) {
+                                // TODO: Add header validity check to prevent false positives from being scanned.
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
@ -4153,7 +4119,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_PDF:
-                            if (type != CL_TYPE_PDF) {
+                            if ((SCAN_PARSE_PDF && (DCONF_DOC & DOC_CONF_PDF)) &&
+                                (type != CL_TYPE_PDF)) {
+                                // TODO: Add header validity check to prevent false positives from being scanned.
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
@ -4166,23 +4134,48 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
                            break;

                        case CL_TYPE_MSEXE:
-                            if (type == CL_TYPE_MSEXE || type == CL_TYPE_ZIP || type == CL_TYPE_MSOLE2) {
-
-                                cli_dbgmsg("*** Detected embedded PE file at %u ***\n", (unsigned int)fpt->offset);
+                            if (SCAN_PARSE_PE && ctx->dconf->pe &&
+                                (type == CL_TYPE_MSEXE || type == CL_TYPE_ZIP || type == CL_TYPE_MSOLE2)) {
+                                struct cli_exe_info peinfo;

                                if ((uint64_t)(ctx->fmap->len - fpt->offset) > ctx->engine->maxembeddedpe) {
                                    cli_dbgmsg("scanraw: MaxEmbeddedPE exceeded\n");
                                    break;
                                }

+                                cli_exe_info_init(&peinfo, 0);
+
+                                // Header validity check to prevent false positives from being scanned.
+                                ret = cli_peheader(ctx->fmap, &peinfo, CLI_PEHEADER_OPT_NONE, NULL);
+
+                                // peinfo memory may have been allocated and must be freed even if it failed.
+                                cli_exe_info_destroy(&peinfo);
+
+                                if (CL_SUCCESS != ret) {
+                                    cli_dbgmsg("Header check for MSEXE detection failed, probably not actually an embedded PE file.\n");
+                                    break;
+                                }
+
+                                cli_dbgmsg("*** Detected embedded PE file at %u ***\n", (unsigned int)fpt->offset);
+
+                                // Setting ctx->corrupted_input will prevent the PE parser from reporting "broken executable" for unpacked/reconstructed files that may not be 100% to spec.
+                                // Save the current flag value here so it can be restored once the nested scan returns.
+                                unsigned int corrupted_input = ctx->corrupted_input;
+
+                                ctx->corrupted_input = 1;
+
                                nret = cli_magic_scan_nested_fmap_type(
                                    ctx->fmap,
                                    fpt->offset,
+                                    // Sadly, there is no way from the PE header to determine the length of the PE file.
+                                    // So we just pass the remaining length of the fmap.
                                    ctx->fmap->len - fpt->offset,
                                    ctx,
                                    CL_TYPE_MSEXE,
                                    NULL,
                                    LAYER_ATTRIBUTES_EMBEDDED);
+
+                                ctx->corrupted_input = corrupted_input;
                            }
                            break;

@ -4789,6 +4782,8 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type)
         * If self protection mechanism enabled, do the scanraw() scan first
         * before extracting with a file type parser.
         */
+        cli_dbgmsg("cli_magic_scan: Performing raw scan to pattern match\n");
+
        ret = scanraw(ctx, type, 0, &dettype);

        // Evaluate the result from the scan to see if it end the scan of this layer early,
@ -5252,6 +5247,8 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type)
        (type != CL_TYPE_HTML || !(SCAN_PARSE_HTML) || !(DCONF_DOC & DOC_CONF_HTML_SKIPRAW)) &&
        (!ctx->engine->sdb)) {

+        cli_dbgmsg("cli_magic_scan: Performing raw scan to pattern match and/or detect embedded files\n");
+
        ret = scanraw(ctx, type, typercg, &dettype);

        // Evaluate the result from the scan to see if it end the scan of this layer early,
@ -5290,57 +5287,12 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type)
        case CL_TYPE_MSEXE:
            perf_nested_start(ctx, PERFT_PE, PERFT_SCAN);
            if (SCAN_PARSE_PE && ctx->dconf->pe) {
-                if (ctx->recursion_stack[ctx->recursion_level].attributes & LAYER_ATTRIBUTES_EMBEDDED) {
-                    /*
-                     * Embedded PE files are PE files that were found within another file using file-type scanning in scanraw()
-                     * They are parsed differently than normal PE files.
-                     */
-                    struct cli_exe_info peinfo;
-
-                    cli_exe_info_init(&peinfo, 0);
-
-                    // TODO We could probably substitute in a quicker
-                    // method of determining whether a PE file exists
-                    // at this offset.
-                    if (cli_peheader(ctx->fmap, &peinfo, CLI_PEHEADER_OPT_NONE, NULL) != 0) {
-                        cli_dbgmsg("Header check for MSEXE detection failed, probably not actually an embedded PE file.\n");
-
-                        /* Despite failing, peinfo memory may have been allocated and must be freed. */
-                        cli_exe_info_destroy(&peinfo);
-
-                    } else {
-                        /* Immediately free up peinfo allocated memory, prior to any recursion */
-                        cli_exe_info_destroy(&peinfo);
-
-                        ret = cli_scanembpe(ctx, 0);
-
-                        // TODO This method of embedded PE extraction
-                        // is kinda gross in that:
-                        //   - if you have an executable that contains
-                        //     20 other exes, the bytes associated with
-                        //     the last exe will have been included in
-                        //     hash computations and things 20 times
-                        //     (as overlay data to the previously
-                        //     extracted exes).
-                        //   - if you have a signed embedded exe, it
-                        //     will fail to validate after extraction
-                        //     bc it has overlay data, which is a
-                        //     violation of the Authenticode spec.
-                        //   - this method of extraction is subject to
-                        //     the recursion limit, which is fairly low.
-                        //
-                        // It'd be awesome if we could compute the PE
-                        // size from the PE header and just extract
-                        // that.
-                    }
-                } else {
                // Setting ctx->corrupted_input will prevent the PE parser from reporting "broken executable" for unpacked/reconstructed files that may not be 100% to spec.
                // In here we're just carrying the corrupted_input flag from parent to child, in case the parent's flag was set.
                unsigned int corrupted_input = ctx->corrupted_input;
                ret                          = cli_scanpe(ctx);
                ctx->corrupted_input         = corrupted_input;
            }
-            }
            perf_nested_stop(ctx, PERFT_PE, PERFT_SCAN);
            break;

@ -5364,8 +5316,9 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type)
            break;

        case CL_TYPE_PDF: /* FIXMELIMITS: pdf should be an archive! */
-            if (SCAN_PARSE_PDF && (DCONF_DOC & DOC_CONF_PDF))
+            if (SCAN_PARSE_PDF && (DCONF_DOC & DOC_CONF_PDF)) {
                ret = cli_scanpdf(ctx, 0);
+            }
            break;

        default:
--- a/libclamav/unarj.c
+++ b/libclamav/unarj.c
@ -56,10 +56,6 @@
 #define CHAR_BIT (8)
 #endif
 #define MAXMATCH 256
-#ifndef FALSE
-#define FALSE (0)
-#define TRUE (1)
-#endif

 #define CODE_BIT 16
 #define NT (CODE_BIT + 3)
@ -814,23 +810,25 @@ static cl_error_t arj_unstore(arj_metadata_t *metadata, int ofd, uint32_t len)
    return CL_SUCCESS;
 }

-static int is_arj_archive(arj_metadata_t *metadata)
+static bool is_arj_archive(arj_metadata_t *metadata)
 {
    const char header_id[2] = {0x60, 0xea};
    const char *mark;

    mark = fmap_need_off_once(metadata->map, metadata->offset, 2);
-    if (!mark)
-        return FALSE;
+    if (!mark) {
+        cli_dbgmsg("is_arj_archive: Failed to read the two-byte ARJ header ID at offset %zu\n", metadata->offset);
+        return false;
+    }
    metadata->offset += 2;
    if (memcmp(&mark[0], &header_id[0], 2) == 0) {
-        return TRUE;
+        return true;
    }
-    cli_dbgmsg("Not an ARJ archive\n");
-    return FALSE;
+    cli_dbgmsg("is_arj_archive: The two-byte ARJ header ID did not match; This is not an ARJ archive\n");
+    return false;
 }

-static int arj_read_main_header(arj_metadata_t *metadata)
+static bool arj_read_main_header(arj_metadata_t *metadata)
 {
    uint16_t header_size, count;
    arj_main_hdr_t main_hdr;
@ -839,7 +837,7 @@ static int arj_read_main_header(arj_metadata_t *metadata)
    struct text_norm_state fnstate, comstate;
    unsigned char *fnnorm  = NULL;
    unsigned char *comnorm = NULL;
-    uint32_t ret           = TRUE;
+    bool ret               = true;

    size_t filename_max_len = 0;
    size_t filename_len     = 0;
@ -848,28 +846,28 @@ static int arj_read_main_header(arj_metadata_t *metadata)
    size_t orig_offset      = metadata->offset;

    if (fmap_readn(metadata->map, &header_size, metadata->offset, 2) != 2)
-        return FALSE;
+        return false;

    metadata->offset += 2;
    header_size = le16_to_host(header_size);
    cli_dbgmsg("Header Size: %d\n", header_size);
    if (header_size == 0) {
        /* End of archive */
-        ret = FALSE;
+        ret = false;
        goto done;
    }
    if (header_size > HEADERSIZE_MAX) {
        cli_dbgmsg("arj_read_header: invalid header_size: %u\n", header_size);
-        ret = FALSE;
+        ret = false;
        goto done;
    }
    if ((header_size + sizeof(header_size)) > (metadata->map->len - metadata->offset)) {
        cli_dbgmsg("arj_read_header: invalid header_size: %u, exceeds length of file.\n", header_size);
-        ret = FALSE;
+        ret = false;
        goto done;
    }
    if (fmap_readn(metadata->map, &main_hdr, metadata->offset, 30) != 30) {
-        ret = FALSE;
+        ret = false;
        goto done;
    }
    metadata->offset += 30;
@ -885,7 +883,7 @@ static int arj_read_main_header(arj_metadata_t *metadata)

    if (main_hdr.first_hdr_size < 30) {
        cli_dbgmsg("Format error. First Header Size < 30\n");
-        ret = FALSE;
+        ret = false;
        goto done;
    }
    if (main_hdr.first_hdr_size > 30) {
@ -895,7 +893,7 @@ static int arj_read_main_header(arj_metadata_t *metadata)
    filename_max_len = (header_size + sizeof(header_size)) - (metadata->offset - orig_offset);
    if (filename_max_len > header_size) {
        cli_dbgmsg("UNARJ: Format error. First Header Size invalid\n");
-        ret = FALSE;
+        ret = false;
        goto done;
    }
    if (filename_max_len > 0) {
@ -903,7 +901,7 @@ static int arj_read_main_header(arj_metadata_t *metadata)
        filename = fmap_need_offstr(metadata->map, metadata->offset, filename_max_len + 1);
        if (!filename || !fnnorm) {
            cli_dbgmsg("UNARJ: Unable to allocate memory for filename\n");
-            ret = FALSE;
+            ret = false;
            goto done;
        }
        filename_len = CLI_STRNLEN(filename, filename_max_len);
@ -913,7 +911,7 @@ static int arj_read_main_header(arj_metadata_t *metadata)
    comment_max_len = (header_size + sizeof(header_size)) - (metadata->offset - orig_offset);
    if (comment_max_len > header_size) {
        cli_dbgmsg("UNARJ: Format error. First Header Size invalid\n");
-        ret = FALSE;
+        ret = false;
        goto done;
    }
    if (comment_max_len > 0) {
@ -921,7 +919,7 @@ static int arj_read_main_header(arj_metadata_t *metadata)
        comment = fmap_need_offstr(metadata->map, metadata->offset, comment_max_len + 1);
        if (!comment || !comnorm) {
            cli_dbgmsg("UNARJ: Unable to allocate memory for comment\n");
-            ret = FALSE;
+            ret = false;
            goto done;
        }
        comment_len = CLI_STRNLEN(comment, comment_max_len);
@ -942,7 +940,7 @@ static int arj_read_main_header(arj_metadata_t *metadata)
    for (;;) {
        const uint16_t *countp = fmap_need_off_once(metadata->map, metadata->offset, 2);
        if (!countp) {
-            ret = FALSE;
+            ret = false;
            goto done;
        }
        count = cli_readint16(countp);
@ -1118,7 +1116,7 @@ static cl_error_t arj_read_file_header(arj_metadata_t *metadata)
    metadata->comp_size = file_hdr.comp_size;
    metadata->orig_size = file_hdr.orig_size;
    metadata->method    = file_hdr.method;
-    metadata->encrypted = ((file_hdr.flags & GARBLE_FLAG) != 0) ? TRUE : FALSE;
+    metadata->encrypted = ((file_hdr.flags & GARBLE_FLAG) != 0) ? true : false;
    metadata->ofd       = -1;
    if (!metadata->filename) {
        ret = CL_EMEM;
@ -1146,27 +1144,112 @@ cl_error_t cli_unarj_open(fmap_t *map, const char *dirname, arj_metadata_t *meta
    metadata->map    = map;
    metadata->offset = 0;
    if (!is_arj_archive(metadata)) {
-        cli_dbgmsg("Not in ARJ format\n");
+        cli_dbgmsg("cli_unarj_open: is_arj_archive check failed\n");
        return CL_EFORMAT;
    }
    if (!arj_read_main_header(metadata)) {
-        cli_dbgmsg("Failed to read main header\n");
+        cli_dbgmsg("cli_unarj_open: Failed to read main header\n");
        return CL_EFORMAT;
    }
    return CL_SUCCESS;
 }

-cl_error_t cli_unarj_prepare_file(const char *dirname, arj_metadata_t *metadata)
+/*
+ * Validate an embedded ARJ archive by parsing headers only, and measure it.
+ *
+ * Starting at `offset` in ctx->fmap, this checks the ARJ marker, reads the
+ * main header, and then walks the file headers that follow, skipping each
+ * file's compressed data. Nothing is extracted or scanned.
+ *
+ * On success `*size` is the number of bytes, counted from `offset`, up to the
+ * end of the last record that parsed cleanly (including the end-of-archive
+ * record when one is found). The value never extends past the end of the map.
+ *
+ * Returns CL_SUCCESS when at least one file header was found, CL_ENULLARG for
+ * NULL arguments, and CL_EFORMAT otherwise.
+ */
+cl_error_t cli_unarj_header_check(
+    cli_ctx *ctx,
+    uint32_t offset,
+    size_t *size)
+{
+    cl_error_t status = CL_EFORMAT;
+    cl_error_t ret;
+    arj_metadata_t metadata = {0};
+    size_t archive_end      = 0;
+    int files_found         = 0;
+
+    cli_dbgmsg("in cli_unarj_header_check\n");
+
+    if (!ctx || !ctx->fmap || !size) {
+        status = CL_ENULLARG;
+        goto done;
+    }
+
+    metadata.map    = ctx->fmap;
+    metadata.offset = offset;
+    *size           = 0;
+
+    if (!is_arj_archive(&metadata)) {
+        cli_dbgmsg("Not in ARJ format\n");
+        status = CL_EFORMAT;
+        goto done;
+    }
+
+    cli_dbgmsg("cli_unarj_header_check: is_arj_archive-check passed\n");
+
+    if (!arj_read_main_header(&metadata)) {
+        cli_dbgmsg("Failed to read main header\n");
+        status = CL_EFORMAT;
+        goto done;
+    }
+
+    cli_dbgmsg("cli_unarj_header_check: Successfully read main header\n");
+
+    for (;;) {
+        size_t remaining;
+
+        metadata.filename  = NULL;
+        metadata.comp_size = 0;
+        metadata.orig_size = 0;
+
+        ret = cli_unarj_prepare_file(&metadata);
+
+        /* Only the sizes matter here; the file name is never used. */
+        CLI_FREE_AND_SET_NULL(metadata.filename);
+
+        if (ret == CL_BREAK) {
+            /* The end-of-archive record belongs to the archive, so count it. */
+            cli_dbgmsg("cli_unarj_header_check: End of archive\n");
+            archive_end = metadata.offset;
+            break;
+        }
+
+        if (ret != CL_SUCCESS) {
+            /*
+             * The failed parse may already have advanced metadata.offset.
+             * Keep archive_end at the end of the last good record instead.
+             */
+            cli_dbgmsg("cli_unarj_header_check: Error reading file header: %s\n", cl_strerror(ret));
+            break;
+        }
+
+        cli_dbgmsg("cli_unarj_header_check: Successfully read file header\n");
+        files_found++;
+
+        remaining = (metadata.offset < metadata.map->len) ? (metadata.map->len - metadata.offset) : 0;
+        if (metadata.comp_size > remaining) {
+            /* The header claims more data than the map holds; stop at the end of the map. */
+            cli_dbgmsg("cli_unarj_header_check: File data extends past the end of the map; archive is truncated\n");
+            archive_end = metadata.map->len;
+            break;
+        }
+
+        /* Skip the file data */
+        metadata.offset += metadata.comp_size;
+        archive_end = metadata.offset;
+    }
+
+    if (files_found > 0) {
+        /* Successfully found at least one file */
+        if (archive_end > metadata.map->len) {
+            archive_end = metadata.map->len;
+        }
+
+        status = CL_SUCCESS;
+        *size  = archive_end - offset;
+        cli_dbgmsg("cli_unarj_header_check: Successfully found %d files in valid ARJ archive of %zu bytes\n", files_found, *size);
+    } else {
+        status = CL_EFORMAT;
+        cli_dbgmsg("cli_unarj_header_check: No files found; Invalid ARJ archive\n");
+    }
+
+done:
+    CLI_FREE_AND_SET_NULL(metadata.filename);
+
+    return status;
+}
+
+cl_error_t cli_unarj_prepare_file(arj_metadata_t *metadata)
 {
    cli_dbgmsg("in cli_unarj_prepare_file\n");
-    if (!metadata || !dirname) {
+
+    if (NULL == metadata) {
+        cli_dbgmsg("cli_unarj_prepare_file: invalid NULL arguments\n");
        return CL_ENULLARG;
    }
+
    /* Each file is preceded by the ARJ file marker */
    if (!is_arj_archive(metadata)) {
        cli_dbgmsg("Not in ARJ format\n");
        return CL_EFORMAT;
    }
+
    return arj_read_file_header(metadata);
 }

--- a/libclamav/unarj.h
+++ b/libclamav/unarj.h
@ -24,7 +24,10 @@
 #ifndef __UNARJ_H
 #define __UNARJ_H

+#include "clamav.h"
+#include "others.h"
 #include "fmap.h"
+
 typedef struct arj_metadata_tag {
    char *filename;
    uint32_t comp_size;
@ -36,8 +39,20 @@ typedef struct arj_metadata_tag {
    size_t offset;
 } arj_metadata_t;

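+/**
+ * @brief Verify ARJ headers and compute the size of the ARJ archive from them.
+ *
+ * Checks the ARJ marker and main header at the given offset, then walks the
+ * file headers that follow. Does not extract or scan the file.
+ *
+ * @param[in,out] ctx     Scan context; the headers are read from ctx->fmap.
+ * @param offset          Offset of the start of the ARJ archive within ctx->fmap.
+ * @param[out] size       Will be set to the number of bytes, counted from offset, spanned by the archive's headers and file data.
+ * @return cl_error_t     CL_SUCCESS if at least one file header was found, or an error code on failure.
+ */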
+cl_error_t cli_unarj_header_check(cli_ctx *ctx, uint32_t offset, size_t *size);
+
 cl_error_t cli_unarj_open(fmap_t *map, const char *dirname, arj_metadata_t *metadata);
-cl_error_t cli_unarj_prepare_file(const char *dirname, arj_metadata_t *metadata);
+cl_error_t cli_unarj_prepare_file(arj_metadata_t *metadata);
 cl_error_t cli_unarj_extract_file(const char *dirname, arj_metadata_t *metadata);

 #endif
--- a/libclamav/unzip.c
+++ b/libclamav/unzip.c
@ -825,6 +825,48 @@ done:
    return status;
 }

+/*
+ * Parse the ZIP local file header at `offset` without extracting or scanning.
+ *
+ * `size` is handed straight to parse_local_file_header(). A header that
+ * parses but describes an entry with a zero compressed or uncompressed size
+ * is reported as CL_EFORMAT.
+ */
+cl_error_t cli_unzip_single_header_check(
+    cli_ctx *ctx,
+    uint32_t offset,
+    size_t *size)
+{
+    struct zip_record record = {0};
+    cl_error_t result;
+
+    result = parse_local_file_header(
+        ctx,
+        offset,
+        NULL,  /* num_files_unzipped */
+        0,     /* file_count */
+        NULL,  /* central_header */
+        NULL,  /* tmpd */
+        false, /* detect_encrypted */
+        NULL,  /* zcb */
+        &record,
+        size);
+
+    if (CL_SUCCESS != result) {
+        cli_dbgmsg("cli_unzip: single header check - failed to parse local file header: %s (%d)\n", cl_strerror(result), result);
+    } else if (0 == record.compressed_size || 0 == record.uncompressed_size) {
+        cli_dbgmsg("cli_unzip: single header check - empty file\n");
+        result = CL_EFORMAT;
+    }
+
+    /* free(NULL) is a no-op, so no guard is needed. */
+    free(record.original_filename);
+
+    return result;
+}
+
 /**
 * @brief Parse, extract, and scan a file by iterating the central directory.
 *
--- a/libclamav/unzip.h
+++ b/libclamav/unzip.h
@ -119,6 +119,18 @@ cl_error_t cli_unzip(cli_ctx *ctx);
 */
 cl_error_t cli_unzip_single(cli_ctx *ctx, size_t local_header_offset);

+/**
+ * @brief Verify a single local file header.
+ *
+ * Does not extract or scan the file.
+ *
+ * @param[in,out] ctx           Scan context
+ * @param offset                Offset of the local file header
+ * @param[out] size             Will be set to the size of the file header + file data.
+ * @return cl_error_t           CL_SUCCESS on success, or an error code on failure.
+ */
+cl_error_t cli_unzip_single_header_check(cli_ctx *ctx, uint32_t offset, size_t *size);
+
 /**
 * @brief Unzip a single file from a zip archive.
 *
--- a/unit_tests/clamscan/embedded_files_test.py
+++ b/unit_tests/clamscan/embedded_files_test.py
@ -0,0 +1,130 @@
+# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+
+"""
+Run clamscan tests.
+"""
+
+import sys
+from zipfile import ZIP_DEFLATED, ZipFile
+
+sys.path.append('../unit_tests')
+import testcase
+
+
+class TC(testcase.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super(TC, cls).setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        super(TC, cls).tearDownClass()
+
+    def setUp(self):
+        super(TC, self).setUp()
+
+    def tearDown(self):
+        super(TC, self).tearDown()
+        self.verify_valgrind_log()
+
+    def test_embedded_zips(self):
+        self.step_name('Test that clamav can successfully extract and alert on multiple embedded ZIP files')
+
+        path_db = TC.path_source / 'unit_tests' / 'input' / 'embedded_testfiles' / 'signatures'
+        testfiles = TC.path_source / 'unit_tests' / 'input' / 'embedded_testfiles' / 'test.png.emb-zips'
+
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles} --gen-json --debug --allmatch'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+            path_db=path_db,
+            testfiles=testfiles,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 1  # no virus, no failures
+
+        expected_stdout = [
+            'test.png.emb-zips: test-file-1-1.UNOFFICIAL FOUND',
+            'test.png.emb-zips: test-file-1-2.UNOFFICIAL FOUND',
+            'test.png.emb-zips: test-file-2-1.UNOFFICIAL FOUND',
+            'test.png.emb-zips: test-file-2-2.UNOFFICIAL FOUND',
+        ]
+        unexpected_stdout = [
+            'OK',
+        ]
+        self.verify_output(output.out, expected=expected_stdout, unexpected=unexpected_stdout)
+
+    def test_embedded_arjs(self):
+        self.step_name('Test that clamav can successfully extract and alert on multiple embedded ARJ files')
+
+        path_db = TC.path_source / 'unit_tests' / 'input' / 'embedded_testfiles' / 'signatures'
+        testfiles = TC.path_source / 'unit_tests' / 'input' / 'embedded_testfiles' / 'test.png.emb-arjs'
+
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles} --gen-json --debug --allmatch'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+            path_db=path_db,
+            testfiles=testfiles,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 1  # no virus, no failures
+
+        expected_stdout = [
+            'test.png.emb-arjs: test-file-1-1.UNOFFICIAL FOUND',
+            'test.png.emb-arjs: test-file-1-2.UNOFFICIAL FOUND',
+            'test.png.emb-arjs: test-file-2-1.UNOFFICIAL FOUND',
+            'test.png.emb-arjs: test-file-2-2.UNOFFICIAL FOUND',
+        ]
+        unexpected_stdout = [
+            'OK',
+        ]
+        self.verify_output(output.out, expected=expected_stdout, unexpected=unexpected_stdout)
+
+    def test_embedded_cabs(self):
+        self.step_name('Test that clamav can successfully extract and alert on multiple embedded CAB files')
+
+        path_db = TC.path_source / 'unit_tests' / 'input' / 'embedded_testfiles' / 'signatures'
+        testfiles = TC.path_source / 'unit_tests' / 'input' / 'embedded_testfiles' / 'test.png.emb-cabs'
+
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles} --gen-json --debug --allmatch'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+            path_db=path_db,
+            testfiles=testfiles,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 1  # no virus, no failures
+
+        expected_stdout = [
+            'test.png.emb-cabs: test-file-1-1.UNOFFICIAL FOUND',
+            'test.png.emb-cabs: test-file-1-2.UNOFFICIAL FOUND',
+            'test.png.emb-cabs: test-file-2-1.UNOFFICIAL FOUND',
+            'test.png.emb-cabs: test-file-2-2.UNOFFICIAL FOUND',
+        ]
+        unexpected_stdout = [
+            'OK',
+        ]
+        self.verify_output(output.out, expected=expected_stdout, unexpected=unexpected_stdout)
+
+    def test_embedded_exes(self):
+        self.step_name('Test that clamav can successfully extract and alert on multiple embedded EXE files')
+
+        path_db = TC.path_source / 'unit_tests' / 'input' / 'embedded_testfiles' / 'signatures'
+        testfiles = TC.path_source / 'unit_tests' / 'input' / 'embedded_testfiles' / 'clam.exe.emb-exes'
+
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles} --gen-json --debug --allmatch'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+            path_db=path_db,
+            testfiles=testfiles,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 1  # no virus, no failures
+
+        expected_stdout = [
+            'clam.exe.emb-exes: Win.Test.LilEXE.UNOFFICIAL FOUND',
+            'clam.exe.emb-exes: Win.Test.SmolEXE.UNOFFICIAL FOUND',
+        ]
+        unexpected_stdout = [
+            'OK',
+        ]
+        self.verify_output(output.out, expected=expected_stdout, unexpected=unexpected_stdout)
--- a/unit_tests/input/embedded_testfiles/clam.exe.emb-exes
+++ b/unit_tests/input/embedded_testfiles/clam.exe.emb-exes
--- a/unit_tests/input/embedded_testfiles/emb/1/test-file-2.ref
+++ b/unit_tests/input/embedded_testfiles/emb/1/test-file-2.ref
--- a/unit_tests/input/embedded_testfiles/emb/1/test-file.ref
+++ b/unit_tests/input/embedded_testfiles/emb/1/test-file.ref
--- a/unit_tests/input/embedded_testfiles/emb/2/test-file-2.ref
+++ b/unit_tests/input/embedded_testfiles/emb/2/test-file-2.ref
--- a/unit_tests/input/embedded_testfiles/emb/2/test-file.ref
+++ b/unit_tests/input/embedded_testfiles/emb/2/test-file.ref
--- a/unit_tests/input/embedded_testfiles/emb/smol_exe/Cargo.lock
+++ b/unit_tests/input/embedded_testfiles/emb/smol_exe/Cargo.lock
@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "t1"
+version = "0.1.0"
--- a/unit_tests/input/embedded_testfiles/emb/smol_exe/Cargo.toml
+++ b/unit_tests/input/embedded_testfiles/emb/smol_exe/Cargo.toml
@ -0,0 +1,13 @@
+[package]
+name = "t1"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+
+[profile.release]
+strip = true
+opt-level = "z"
+lto = true
+codegen-units = 1
+panic = "abort"
--- a/unit_tests/input/embedded_testfiles/emb/smol_exe/src/main.rs
+++ b/unit_tests/input/embedded_testfiles/emb/smol_exe/src/main.rs
@ -0,0 +1,37 @@
+#![no_std]
+#![no_main]
+
+use core::arch::asm;
+
+///
+/// This is basically a Windows port of the example from: <https://darkcoding.net/software/a-very-small-rust-binary-indeed/>
+/// With one minor change to print a message instead of only exiting with a code.
+/// Thank you to the author.
+///
+/// Build with:
+/// ```powershell
+/// $env:RUSTFLAGS="-Ctarget-cpu=native -Clink-args=/ENTRY:_start -Clink-args=/SUBSYSTEM:CONSOLE -Clink-args=/LARGEADDRESSAWARE:NO -Clink-args=ucrt.lib -Crelocation-model=static -Clink-args=-Wl,-n,-N,--no-dynamic-linker,--no-pie,--build-id=none,--no-eh-frame-hdr"
+/// cargo +nightly build  -Z build-std=std,panic_abort -Z build-std-features="optimize_for_size" --target x86_64-pc-windows-msvc  --release
+/// ```
+///
+
+// The symbol is left unmangled because the build instructions pass
+// `/ENTRY:_start` to the linker, which looks the entry point up by this name.
+#[unsafe(no_mangle)]
+pub extern "C" fn _start() -> ! {
+    // NUL-terminated byte string whose address is handed to `puts` below.
+    let s = b"Lil EXE\n\0";
+    unsafe {
+        // Copies the string's address into rcx and calls `puts`.
+        // Presumably rcx is the first-argument register for the
+        // x86_64-pc-windows-msvc target named in the build notes; confirm.
+        // NOTE(review): `call` pushes a return address, which looks at odds
+        // with the `nostack` option; check the inline-assembly reference
+        // before reusing this pattern.
+        asm!(
+            "mov rcx, {0}",
+            "call puts",
+            in(reg) s.as_ptr(),
+            options(nostack, noreturn)
+        )
+        // nostack prevents `asm!` from push/pop rax
+        // noreturn prevents it putting a 'ret' at the end
+        //  but it does put a ud2 (undefined instruction) instead
+    }
+}
+
+/// Panic handler for this `#![no_std]` binary.
+///
+/// Ignores the panic information and spins forever, which satisfies the
+/// `-> !` return type without needing any runtime support.
+#[panic_handler]
+fn my_panic(_info: &core::panic::PanicInfo) -> ! {
+    loop {}
+}
--- a/unit_tests/input/embedded_testfiles/signatures/1.1.hsb
+++ b/unit_tests/input/embedded_testfiles/signatures/1.1.hsb
@ -0,0 +1 @@
+579de681add9f8c686fa791c49d1222a63c236febff37769b5fb50659b007491:16:test-file-1-1
--- a/unit_tests/input/embedded_testfiles/signatures/1.2.hsb
+++ b/unit_tests/input/embedded_testfiles/signatures/1.2.hsb
@ -0,0 +1 @@
+b56f04ceb6cadbc0f50f9acfadbedc81257a6af21f6212ef57a70a599fc8bf38:16:test-file-1-2
--- a/unit_tests/input/embedded_testfiles/signatures/2.1.hsb
+++ b/unit_tests/input/embedded_testfiles/signatures/2.1.hsb
@ -0,0 +1 @@
+1cce5c6d7f11469ffa6153481b7d6275534ce7c62bc34f12f7d742c5e6cf026b:24:test-file-2-1
--- a/unit_tests/input/embedded_testfiles/signatures/2.2.hsb
+++ b/unit_tests/input/embedded_testfiles/signatures/2.2.hsb
@ -0,0 +1 @@
+6d1c6cae0a30435b52d362544bea666492d06173ded04504bf30f369abfadd50:27:test-file-2-2
--- a/unit_tests/input/embedded_testfiles/signatures/lil.exe.ldb
+++ b/unit_tests/input/embedded_testfiles/signatures/lil.exe.ldb
@ -0,0 +1,2 @@
+# Match on "Lil EXE\n"
+Win.Test.LilEXE;Engine:90-255,Target:1;0;1552:4c696c204558450a
--- a/unit_tests/input/embedded_testfiles/signatures/smol.exe.ldb
+++ b/unit_tests/input/embedded_testfiles/signatures/smol.exe.ldb
@ -0,0 +1,2 @@
+# Match on "Smol EXE\n"
+Win.Test.SmolEXE;Engine:90-255,Target:1;0;1552:536d6f6c204558450a
--- a/unit_tests/input/embedded_testfiles/test.png.emb-arjs
+++ b/unit_tests/input/embedded_testfiles/test.png.emb-arjs
--- a/unit_tests/input/embedded_testfiles/test.png.emb-cabs
+++ b/unit_tests/input/embedded_testfiles/test.png.emb-cabs
--- a/unit_tests/input/embedded_testfiles/test.png.emb-zips
+++ b/unit_tests/input/embedded_testfiles/test.png.emb-zips
				`@ -0,0 +1 @@`
				`579de681add9f8c686fa791c49d1222a63c236febff37769b5fb50659b007491:16:test-file-1-1`
				`@ -0,0 +1 @@`
				`b56f04ceb6cadbc0f50f9acfadbedc81257a6af21f6212ef57a70a599fc8bf38:16:test-file-1-2`
				`@ -0,0 +1 @@`
				`1cce5c6d7f11469ffa6153481b7d6275534ce7c62bc34f12f7d742c5e6cf026b:24:test-file-2-1`
				`@ -0,0 +1 @@`
				`6d1c6cae0a30435b52d362544bea666492d06173ded04504bf30f369abfadd50:27:test-file-2-2`