diff --git a/libclamav/matcher-ac.c b/libclamav/matcher-ac.c index 77cc331c7..7f05eabb1 100644 --- a/libclamav/matcher-ac.c +++ b/libclamav/matcher-ac.c @@ -833,8 +833,9 @@ int cli_ac_chklsig(const char *expr, const char *end, uint32_t *lsigcnt, unsigne return -1; } - for (i += 2; i + 1 < len && (isdigit(expr[i + 1]) || expr[i + 1] == ','); i++) - ; + for (i += 2; i + 1 < len && (isdigit(expr[i + 1]) || expr[i + 1] == ','); i++) { + continue; + } } if (&expr[i + 1] == rend) @@ -1625,19 +1626,23 @@ void cli_ac_freedata(struct cli_ac_data *data) } } -/* returns only CL_SUCCESS or CL_EMEM */ -inline static int ac_addtype(struct cli_matched_type **list, cli_file_t type, off_t offset, const cli_ctx *ctx) +/** + * @brief Add a match for an object type to the list of matched types. + * + * Important: The caller is responsible for checking limits! + * + * @param list Pointer to the list of matched types. *list may be NULL if no types have been added yet. + * @param type The type of the embedded object. + * @param offset The offset of the embedded object. + * @param ctx The context information. May be NULL. + * @return cl_error_t CL_SUCCESS regardless if added, or CL_EMEM if memory allocation failed. + */ +inline static cl_error_t ac_addtype(struct cli_matched_type **list, cli_file_t type, off_t offset, const cli_ctx *ctx) { - struct cli_matched_type *tnode, *tnode_last; + struct cli_matched_type *tnode; - if (type == CL_TYPE_ZIPSFX) { - if (*list && ctx && ctx->engine->maxfiles && (*list)->cnt > ctx->engine->maxfiles) - return CL_SUCCESS; - } else if (*list && (*list)->cnt >= MAX_EMBEDDED_OBJ) { - return CL_SUCCESS; - } - - if (!(tnode = calloc(1, sizeof(struct cli_matched_type)))) { + tnode = calloc(1, sizeof(struct cli_matched_type)); + if (NULL == tnode) { cli_errmsg("cli_ac_addtype: Can't allocate memory for new type node\n"); return CL_EMEM; } @@ -1645,16 +1650,25 @@ inline static int ac_addtype(struct cli_matched_type **list, cli_file_t type, of tnode->type = type; tnode->offset = offset; - tnode_last = *list; - while (tnode_last && tnode_last->next) - tnode_last = tnode_last->next; + if (*list) { + // Add to end of existing list. + struct cli_matched_type *tnode_last = *list; - if (tnode_last) + while (tnode_last && tnode_last->next) { + tnode_last = tnode_last->next; + } tnode_last->next = tnode; - else + } else { + // First type in the list. *list = tnode; + } (*list)->cnt++; + + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: added %s embedded object at offset " STDi64 ". Embedded object count: %d\n", cli_ftname(type), (uint64_t)offset, (*list)->cnt); + } + return CL_SUCCESS; } @@ -1999,14 +2013,65 @@ cl_error_t cli_ac_scanbuff( cli_dbgmsg("Matched signature for file type %s\n", pt->virname); type = pt->type; - if ((ftoffset != NULL) && - ((*ftoffset == NULL) || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ || type == CL_TYPE_ZIPSFX) && (type >= CL_TYPE_SFX || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2) && type == CL_TYPE_MSEXE))) { - /* FIXME: the first offset in the array is most likely the correct one but - * it may happen it is not - */ - for (j = 1; j <= CLI_DEFAULT_AC_TRACKLEN + 1 && offmatrix[0][j] != (uint32_t)-1; j++) - if (ac_addtype(ftoffset, type, offmatrix[pt->parts - 1][j], ctx)) - return CL_EMEM; + + if (ftoffset != NULL) { + // Caller provided a pointer to record matched types. + bool too_many_types = false; + bool supported_type = false; + + if (*ftoffset != NULL) { + // Have some type matches already. Check limits. + + if (ctx && ((type == CL_TYPE_ZIPSFX) || + (type == CL_TYPE_MSEXE && ftype == CL_TYPE_MSEXE))) { + // When ctx present, limit the number of type matches using ctx->engine->maxfiles for specific types. + // Reasoning: + // ZIP local file header entries likely to be numerous if a single ZIP appended to the scanned file. + // MSEXE can contain many embedded MSEXE entries and MSEXE type false positives matches. + + if (ctx->engine->maxfiles == 0) { + // Max-files limit is disabled. + } else if ((*ftoffset)->cnt >= ctx->engine->maxfiles) { + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached maxfiles limit of %u\n", cli_ftname(type), (*ftoffset)->offset, ctx->engine->maxfiles); + } + too_many_types = true; + } + } else { + // Limit the number of type matches using MAX_EMBEDDED_OBJ. + if ((*ftoffset)->cnt >= MAX_EMBEDDED_OBJ) { + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached MAX_EMBEDDED_OBJ limit of %u\n", cli_ftname(type), (*ftoffset)->offset, MAX_EMBEDDED_OBJ); + } + too_many_types = true; + } + } + } + + // Filter to supported types. + if ( + // Found type is MBR. + type == CL_TYPE_MBR || + // Found type is any SFX type (i.e., ZIPSFX, RARSFX, 7ZSSFX, etc.). + type >= CL_TYPE_SFX || + // Found type is an MSEXE, but only if host file type is one of MSEXE, ZIP, or MSOLE2. + (type == CL_TYPE_MSEXE && (ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2))) { + + supported_type = true; + } + + if (supported_type && !too_many_types) { + /* FIXME: the first offset in the array is most likely the correct one but + * it may happen it is not + * Until we're certain and can fix this, we add all offsets in the list. + */ + for (j = 1; j <= CLI_DEFAULT_AC_TRACKLEN + 1 && offmatrix[0][j] != (uint32_t)-1; j++) { + ret = ac_addtype(ftoffset, type, offmatrix[pt->parts - 1][j], ctx); + if (CL_SUCCESS != ret) { + return ret; + } + } + } } memset(offmatrix[0], (uint32_t)-1, pt->parts * (CLI_DEFAULT_AC_TRACKLEN + 2) * sizeof(uint32_t)); @@ -2066,11 +2131,59 @@ cl_error_t cli_ac_scanbuff( cli_dbgmsg("Matched signature for file type %s at %u\n", pt->virname, realoff); type = pt->type; - if ((ftoffset != NULL) && - ((*ftoffset == NULL) || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ || type == CL_TYPE_ZIPSFX) && (type == CL_TYPE_MBR || type >= CL_TYPE_SFX || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2) && type == CL_TYPE_MSEXE))) { - if (ac_addtype(ftoffset, type, realoff, ctx)) - return CL_EMEM; + if (ftoffset != NULL) { + // Caller provided a pointer to record matched types. + bool too_many_types = false; + bool supported_type = false; + + if (*ftoffset != NULL) { + // Have some type matches already. Check limits. + + if (ctx && ((type == CL_TYPE_ZIPSFX) || + (type == CL_TYPE_MSEXE && ftype == CL_TYPE_MSEXE))) { + // When ctx present, limit the number of type matches using ctx->engine->maxfiles for specific types. + // Reasoning: + // ZIP local file header entries likely to be numerous if a single ZIP appended to the scanned file. + // MSEXE can contain many embedded MSEXE entries and MSEXE type false positives matches. + + if (ctx->engine->maxfiles == 0) { + // Max-files limit is disabled. + } else if ((*ftoffset)->cnt >= ctx->engine->maxfiles) { + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached maxfiles limit of %u\n", cli_ftname(type), (*ftoffset)->offset, ctx->engine->maxfiles); + } + too_many_types = true; + } + } else { + // Limit the number of type matches using MAX_EMBEDDED_OBJ. + if ((*ftoffset)->cnt >= MAX_EMBEDDED_OBJ) { + if (UNLIKELY(cli_get_debug_flag())) { + cli_dbgmsg("ac_addtype: Can't add %s type at offset " STDu64 " to list of embedded type matches. Reached MAX_EMBEDDED_OBJ limit of %u\n", cli_ftname(type), (*ftoffset)->offset, MAX_EMBEDDED_OBJ); + } + too_many_types = true; + } + } + } + + // Filter to supported types. + if ( + // Found type is MBR. + type == CL_TYPE_MBR || + // Found type is any SFX type (i.e., ZIPSFX, RARSFX, 7ZSSFX, etc.). + type >= CL_TYPE_SFX || + // Found type is an MSEXE, but only if host file type is one of MSEXE, ZIP, or MSOLE2. + (type == CL_TYPE_MSEXE && (ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP || ftype == CL_TYPE_MSOLE2))) { + + supported_type = true; + } + + if (supported_type && !too_many_types) { + ret = ac_addtype(ftoffset, type, realoff, ctx); + if (CL_SUCCESS != ret) { + return ret; + } + } } } } else { diff --git a/libclamav/matcher.h b/libclamav/matcher.h index 32581f0de..310451b46 100644 --- a/libclamav/matcher.h +++ b/libclamav/matcher.h @@ -390,6 +390,6 @@ cl_error_t cli_matchmeta(cli_ctx *ctx, const char *fname, size_t fsizec, size_t * - 9 - MachO * @param ctx The current scan context */ -void cli_targetinfo(struct cli_target_info *info, unsigned int target, cli_ctx *ctx); +void cli_targetinfo(struct cli_target_info *info, cli_target_t target, cli_ctx *ctx); #endif diff --git a/libclamav/scanners.c b/libclamav/scanners.c index 2cb2cca49..980182542 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -1665,7 +1665,7 @@ static cl_error_t cli_ole2_tempdir_scan_vba_new(const char *dir, cli_ctx *ctx, s goto done; } - ret = cli_scan_desc(tempfd, ctx, CL_TYPE_SCRIPT, false, NULL, AC_SCAN_VIR, NULL, "extracted-vba-project", tempfile, LAYER_ATTRIBUTES_NORMALIZED); + ret = cli_scan_desc(tempfd, ctx, CL_TYPE_SCRIPT, false, NULL, AC_SCAN_VIR, NULL, "extracted-vba-project", tempfile, LAYER_ATTRIBUTES_NONE); if (CL_SUCCESS != ret) { goto done; } @@ -3650,7 +3650,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi // Omit OLD TAR files because it's a raw archive format that we can extract and scan manually. (type != CL_TYPE_OLD_TAR) && // Omit POSIX TAR files because it's a raw archive format that we can extract and scan manually. - (type != CL_TYPE_POSIX_TAR)) { + (type != CL_TYPE_POSIX_TAR) && + // Omit TNEF files because TNEF message attachments are raw / not compressed. Document and ZIP attachments would be likely to have double-extraction issues. + (type != CL_TYPE_TNEF)) { /* * Enable file type recognition scan mode if requested, except for some problematic types (above). */ @@ -3668,6 +3670,8 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi // In allmatch-mode, ret will never be CL_VIRUS, so ret may be used exclusively for file type detection and for terminal errors. // When not in allmatch-mode, it's more important to return right away if ret is CL_VIRUS, so we don't care if file type matches were found. if (ret >= CL_TYPENO) { + size_t last_offset = 0; + // Matched 1+ file type signatures. Handle them. found_type = (cli_file_t)ret; @@ -3676,11 +3680,16 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi fpt = ftoffset; while (fpt) { - if (fpt->offset > 0) { + if ((fpt->offset > 0) && + // Only handle each offset once to prevent duplicate processing like if two signatures are found at the same offset. + ((size_t)fpt->offset > last_offset)) { + bool type_has_been_handled = true; bool ancestor_was_embedded = false; size_t i; + last_offset = (size_t)fpt->offset; + /* * First, use "embedded type recognition" to identify a file's actual type. * (a.k.a. not embedded files, but file type detection corrections) @@ -3871,84 +3880,10 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } - /* - * Only scan embedded files if we are not already in an embedded context. - * That is, if this or a previous layer was identified with embedded file type recognition, then we do - * not scan for embedded files again. - * - * This restriction will prevent detecting the same embedded content more than once when recursing with - * embedded file type recognition deeper within the same buffer. - * - * This is necessary because we have no way of knowing the length of a file for many formats and cannot - * prevent a search for embedded files from finding the same embedded content multiple times (like a LOT - * of times). - * - * E.g. if the file is like this: - * - * [ data ] [ embedded file ] [ data ] [ embedded file ] - * - * The first time we do it we'll find "two" embedded files, like this: - * - * Emb. File #1: [ embedded file ] [ data ] [ embedded file ] - * Emb. File #2: [ embedded file ] - * - * We must not scan Emb. File #1 again for embedded files, because it would double-extract Emb. File #2. - * - * There is a flaw in this logic, though. Suppose that we actually have: - * - * [ data ] [ compressed file w. recognizable magic bytes ] - * - * A first pass of the above will again identify "two" embedded files: - * - * Emb. File #1: [ compressed archive w. recognizable magic bytes ] - * Emb. File #2: [ magic bytes ] <- Compressed data/Not real file - * - * In this case, the magic bytes of a contained, compressed file is somehow still identifiable despite - * compression. The result is the Emb. File #2 will fail to be parsed and when we decompress Emb. File - * #1, then we maybe get something like this: - * - * Decompressed: [ data ] [ embedded file ] - * - * So if this happened... then we WOULD want to scan the decompressed file for embedded files. - * The problem is, we have no way of knowing how long embedded files are. - * We don't know if we have: - * - * A. [ data ] [ embedded file ] [ data ] [ embedded file ] - * or - * B. [ data ] [ embedded compressed archive w. recognizable magic bytes ] - * or - * C. [ data ] [ embedded uncompressed archive w. multiple file entries [ file 1 ] [ file 2 ] [ file 2 ] ] - * - * Some ideas for a more accurate solution: - * - * 1. Record the offset and size of each file extracted by the parsers. - * Then, when we do embedded file type recognition, we can check if the offset and size of the - * embedded file matches the offset and size of a file that was extracted by a parser. - * This falls apart a little bit for multiple layers of archives unless we also compare offsets within - * each layer. We could do that, but it would be a lot of work. And we'd probably want to take into - * consideration if files were decompressed or decrypted. ... I don't know a clean solution. - * - * 2. Have all parsers to run before embedded file type recognition and they each determine the length - * of the file they parsed, so we can differentiate between embedded files and appended files. - * For appended files, we would know they weren't extracted by a parser module and the parser for - * each of those would report the length of the file it parsed so we can use that to mitigate - * overlapping embedded file type recognition. - * But I highly doubt all file types can be parsed to determine the correct length of the file. - */ - for (i = ctx->recursion_level; i > 0; i--) { - if (ctx->recursion_stack[i].attributes & LAYER_ATTRIBUTES_EMBEDDED) { - // Found an ancestor that was embedded. - // Do not scan embedded files again. - ancestor_was_embedded = true; - break; - } - } - /* * Next, check for actual embedded files. */ - if ((false == ancestor_was_embedded) && - (false == type_has_been_handled)) { + if (false == type_has_been_handled) { cli_dbgmsg("%s signature found at %u\n", cli_ftname(fpt->type), (unsigned int)fpt->offset); type_has_been_handled = true; @@ -4001,6 +3936,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += zip_size - 1; // Note: size is definitely > 0 because header_check succeeded. + nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, @@ -4023,6 +3961,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += cab_size - 1; // Note: size is definitely > 0 because header_check succeeded. + nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, @@ -4046,6 +3987,9 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi break; } + // Increment last_offset to ignore any file type matches that occured within this legitimate archive. + last_offset += arj_size - 1; // Note: size is definitely > 0 because header_check succeeded. + nret = cli_magic_scan_nested_fmap_type( ctx->fmap, fpt->offset, diff --git a/libclamav/unzip.c b/libclamav/unzip.c index 90920ec5b..e9b912da4 100644 --- a/libclamav/unzip.c +++ b/libclamav/unzip.c @@ -717,17 +717,35 @@ static cl_error_t parse_local_file_header( zip += LOCAL_HEADER_elen; bytes_remaining -= LOCAL_HEADER_elen; - if (!csize) { /* FIXME: what's used for method0 files? csize or usize? Nothing in the specs, needs testing */ - cli_dbgmsg("cli_unzip: local header - skipping empty file\n"); - } else { - if (bytes_remaining < csize) { - cli_dbgmsg("cli_unzip: local header - stream out of file\n"); - status = CL_EPARSE; - goto done; - } + if (bytes_remaining < csize) { + cli_dbgmsg("cli_unzip: local header - stream out of file\n"); + status = CL_EPARSE; + goto done; + } + if (NULL != record) { /* Don't actually unzip if we're just collecting the file record information (offset, sizes) */ - if (NULL == record) { + if (NULL == original_filename) { + record->original_filename = NULL; + } else { + record->original_filename = CLI_STRNDUP(original_filename, strlen(original_filename)); + } + record->local_header_offset = loff; + record->local_header_size = zip - local_header; + record->compressed_size = csize; + record->uncompressed_size = usize; + record->method = LOCAL_HEADER_method; + record->flags = LOCAL_HEADER_flags; + record->encrypted = (LOCAL_HEADER_flags & F_ENCR) ? 1 : 0; + + status = CL_SUCCESS; + } else { + /* + * Unzip or decompress & then unzip. + */ + if (!csize) { /* FIXME: what's used for method0 files? csize or usize? Nothing in the specs, needs testing */ + cli_dbgmsg("cli_unzip: local header - skipping empty file\n"); + } else { zip = fmap_need_ptr_once(ctx->fmap, zip, csize); if (NULL == zip) { cli_dbgmsg("cli_unzip: local header - data out of file\n"); @@ -751,27 +769,12 @@ static cl_error_t parse_local_file_header( goto done; } } - } else { - if (NULL == original_filename) { - record->original_filename = NULL; - } else { - record->original_filename = CLI_STRNDUP(original_filename, strlen(original_filename)); - } - record->local_header_offset = loff; - record->local_header_size = zip - local_header; - record->compressed_size = csize; - record->uncompressed_size = usize; - record->method = LOCAL_HEADER_method; - record->flags = LOCAL_HEADER_flags; - record->encrypted = (LOCAL_HEADER_flags & F_ENCR) ? 1 : 0; - - status = CL_SUCCESS; } - - zip += csize; - bytes_remaining -= csize; } + zip += csize; + bytes_remaining -= csize; + if (LOCAL_HEADER_flags & F_USEDD) { if (bytes_remaining < 12) { cli_dbgmsg("cli_unzip: local header - data desc out of file\n"); @@ -910,8 +913,8 @@ static cl_error_t parse_central_directory_file_header( central_header = fmap_need_off(ctx->fmap, central_file_header_offset, SIZEOF_CENTRAL_HEADER); if (NULL == central_header) { - cli_dbgmsg("cli_unzip: central header - file header offset out of file\n"); - status = CL_EPARSE; + cli_dbgmsg("cli_unzip: central header - reached end of central directory.\n"); + status = CL_BREAK; goto done; } @@ -1310,7 +1313,8 @@ cl_error_t index_local_file_headers_within_bounds( index = *num_records; if (start_offset > fsize || end_offset > fsize || start_offset > end_offset) { - cli_errmsg("index_local_file_headers_within_bounds: Invalid offset arguments\n"); + cli_errmsg("index_local_file_headers_within_bounds: Invalid offset arguments: start_offset=%u, end_offset=%u, fsize=%u\n", + start_offset, end_offset, fsize); status = CL_EPARSE; goto done; } @@ -2075,6 +2079,9 @@ cl_error_t unzip_search(cli_ctx *ctx, struct zip_requests *requests) status = CL_ETIMEOUT; goto done; } + + // Increment to the next central file header. + central_file_header_offset += file_record_size; } while ((ret == CL_SUCCESS) && (file_record_size > 0)); } else { cli_dbgmsg("unzip_search: Cannot locate central directory. unzip_search failed.\n"); @@ -2106,7 +2113,7 @@ cl_error_t unzip_search_single(cli_ctx *ctx, const char *name, size_t nlen, uint // Search for the zip file entry in the current layer. status = unzip_search(ctx, &requests); - if (CL_SUCCESS == status) { + if (CL_VIRUS == status) { *loff = requests.loff; }