clamav/libclamav/xar.c

/*
* Copyright (C) 2013-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2013 Sourcefire, Inc.
*
* Authors: Steven Morgan <smorgan@sourcefire.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <errno.h>
#include "xar.h"
#include "fmap.h"
#include <libxml/xmlreader.h>
#include "clamav.h"
#include "str.h"
#include "scanners.h"
#include "inflate64.h"
#include "lzma_iface.h"
/*
xar_cleanup_temp_file - cleanup after cli_gentempfd
parameters:
ctx - cli_ctx context pointer
fd - fd to close
tmpname - name of file to unlink, address of storage to free
returns - CL_SUCCESS or CL_EUNLINK
*/
static int xar_cleanup_temp_file(cli_ctx *ctx, int fd, char *tmpname)
{
int rc = CL_SUCCESS;
if (fd > -1)
close(fd);
if (tmpname != NULL) {
if (!ctx->engine->keeptmp) {
if (cli_unlink(tmpname)) {
cli_dbgmsg("cli_scanxar: error unlinking tmpfile %s\n", tmpname);
rc = CL_EUNLINK;
}
}
free(tmpname);
}
return rc;
}
/*
xar_get_numeric_from_xml_element - extract xml element value as numeric
parameters:
reader - xmlTextReaderPtr
value - pointer to size_t to contain the returned value
returns - CL_SUCCESS or CL_EFORMAT
*/
static int xar_get_numeric_from_xml_element(xmlTextReaderPtr reader, size_t *value)
{
const xmlChar *numstr;
if (xmlTextReaderRead(reader) == 1 && xmlTextReaderNodeType(reader) == XML_READER_TYPE_TEXT) {
numstr = xmlTextReaderConstValue(reader);
if (numstr) {
long numval;
char *endptr = NULL;
errno = 0;
numval = strtol((const char *)numstr, &endptr, 10);
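/* strtol() reports overflow by returning LONG_MAX/LONG_MIN with errno set, and reports a failed
conversion by leaving endptr at the start of the string; both cases, and negative values, are
rejected below. */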
if ((((numval == LONG_MAX) || (numval == LONG_MIN)) && errno) ||
((const xmlChar *)endptr == numstr)) {
cli_dbgmsg("cli_scanxar: XML element value invalid\n");
return CL_EFORMAT;
} else if (numval < 0) {
cli_dbgmsg("cli_scanxar: XML element value %li\n", numval);
return CL_EFORMAT;
}
*value = numval;
return CL_SUCCESS;
}
}
cli_dbgmsg("cli_scanxar: No text for XML element\n");
return CL_EFORMAT;
}
/*
xar_get_checksum_values - extract checksum and hash algorithm from xml element
parameters:
reader - xmlTextReaderPtr
cksum - pointer to unsigned char* for returning the checksum value.
hash - pointer to int for returning checksum algorithm.
returns - void
*/
static void xar_get_checksum_values(xmlTextReaderPtr reader, unsigned char **cksum, int *hash)
{
xmlChar *style = xmlTextReaderGetAttribute(reader, (const xmlChar *)"style");
const xmlChar *xmlval;
*hash = XAR_CKSUM_NONE;
if (style == NULL) {
cli_dbgmsg("cli_scanxar: xmlTextReaderGetAttribute no style attribute "
"for checksum element\n");
} else {
cli_dbgmsg("cli_scanxar: checksum algorithm is %s.\n", style);
if (0 == xmlStrcasecmp(style, (const xmlChar *)"sha1")) {
*hash = XAR_CKSUM_SHA1;
} else if (0 == xmlStrcasecmp(style, (const xmlChar *)"md5")) {
*hash = XAR_CKSUM_MD5;
} else {
cli_dbgmsg("cli_scanxar: checksum algorithm %s is unsupported.\n", style);
*hash = XAR_CKSUM_OTHER;
}
}
if (style != NULL)
xmlFree(style);
if (xmlTextReaderRead(reader) == 1 && xmlTextReaderNodeType(reader) == XML_READER_TYPE_TEXT) {
xmlval = xmlTextReaderConstValue(reader);
if (xmlval) {
cli_dbgmsg("cli_scanxar: checksum value is %s.\n", xmlval);
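/* TOC checksum values are hex-encoded, so a valid digest string is twice the raw hash length:
2 * SHA1_HASH_SIZE or 2 * MD5_HASH_SIZE characters. */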
if (((*hash == XAR_CKSUM_SHA1) && (xmlStrlen(xmlval) == 2 * SHA1_HASH_SIZE)) ||
((*hash == XAR_CKSUM_MD5) && (xmlStrlen(xmlval) == 2 * MD5_HASH_SIZE))) {
*cksum = xmlStrdup(xmlval);
} else {
cli_dbgmsg("cli_scanxar: checksum type is unknown or length is invalid.\n");
*hash = XAR_CKSUM_OTHER;
*cksum = NULL;
}
} else {
*cksum = NULL;
cli_dbgmsg("cli_scanxar: xmlTextReaderConstValue() returns NULL for checksum value.\n");
}
} else
cli_dbgmsg("cli_scanxar: No text for XML checksum element.\n");
}
/*
xar_get_toc_data_values - return the values of a <data> or <ea> xml element that represent
an extent of data on the heap.
parameters:
reader - xmlTextReaderPtr
length - pointer to size_t for returning value of the <length> element.
offset - pointer to size_t for returning value of the <offset> element.
size - pointer to size_t for returning value of the <size> element.
encoding - pointer to int for returning indication of the <encoding> style attribute.
a_cksum - pointer to unsigned char* for returning the archived checksum value.
a_hash - pointer to int for returning the archived checksum algorithm.
e_cksum - pointer to unsigned char* for returning the extracted checksum value.
e_hash - pointer to int for returning the extracted checksum algorithm.
returns - CL_EFORMAT, CL_SUCCESS, or CL_BREAK. CL_BREAK indicates no more <data>/<ea> elements.
*/
static int xar_get_toc_data_values(xmlTextReaderPtr reader, size_t *length, size_t *offset, size_t *size, int *encoding,
unsigned char **a_cksum, int *a_hash, unsigned char **e_cksum, int *e_hash)
{
const xmlChar *name;
int indata = 0, inea = 0;
int rc, gotoffset = 0, gotlength = 0, gotsize = 0;
*a_cksum = NULL;
*a_hash = XAR_CKSUM_NONE;
*e_cksum = NULL;
*e_hash = XAR_CKSUM_NONE;
*encoding = CL_TYPE_ANY;
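/* encoding defaults to CL_TYPE_ANY (stored/uncompressed data) until an <encoding> element maps
the MIME style onto the CL_TYPE_* used later to choose a decompressor. */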
rc = xmlTextReaderRead(reader);
while (rc == 1) {
name = xmlTextReaderConstLocalName(reader);
if (indata || inea) {
/* cli_dbgmsg("cli_scanxar: xmlTextReaderRead read %s\n", name); */
if (xmlStrEqual(name, (const xmlChar *)"offset") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
if (CL_SUCCESS == xar_get_numeric_from_xml_element(reader, offset))
gotoffset = 1;
} else if (xmlStrEqual(name, (const xmlChar *)"length") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
if (CL_SUCCESS == xar_get_numeric_from_xml_element(reader, length))
gotlength = 1;
} else if (xmlStrEqual(name, (const xmlChar *)"size") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
if (CL_SUCCESS == xar_get_numeric_from_xml_element(reader, size))
gotsize = 1;
} else if (xmlStrEqual(name, (const xmlChar *)"archived-checksum") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
cli_dbgmsg("cli_scanxar: <archived-checksum>:\n");
xar_get_checksum_values(reader, a_cksum, a_hash);
} else if ((xmlStrEqual(name, (const xmlChar *)"extracted-checksum") ||
xmlStrEqual(name, (const xmlChar *)"unarchived-checksum")) &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
cli_dbgmsg("cli_scanxar: <extracted-checksum>:\n");
xar_get_checksum_values(reader, e_cksum, e_hash);
} else if (xmlStrEqual(name, (const xmlChar *)"encoding") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
xmlChar *style = xmlTextReaderGetAttribute(reader, (const xmlChar *)"style");
if (style == NULL) {
cli_dbgmsg("cli_scanxar: xmlTextReaderGetAttribute no style attribute "
"for encoding element\n");
*encoding = CL_TYPE_ANY;
} else if (xmlStrEqual(style, (const xmlChar *)"application/x-gzip")) {
cli_dbgmsg("cli_scanxar: encoding = application/x-gzip.\n");
*encoding = CL_TYPE_GZ;
} else if (xmlStrEqual(style, (const xmlChar *)"application/octet-stream")) {
cli_dbgmsg("cli_scanxar: encoding = application/octet-stream.\n");
*encoding = CL_TYPE_ANY;
} else if (xmlStrEqual(style, (const xmlChar *)"application/x-bzip2")) {
cli_dbgmsg("cli_scanxar: encoding = application/x-bzip2.\n");
*encoding = CL_TYPE_BZ;
} else if (xmlStrEqual(style, (const xmlChar *)"application/x-lzma")) {
cli_dbgmsg("cli_scanxar: encoding = application/x-lzma.\n");
*encoding = CL_TYPE_7Z;
} else if (xmlStrEqual(style, (const xmlChar *)"application/x-xz")) {
cli_dbgmsg("cli_scanxar: encoding = application/x-xz.\n");
*encoding = CL_TYPE_XZ;
} else {
cli_dbgmsg("cli_scanxar: unknown style value=%s for encoding element\n", style);
*encoding = CL_TYPE_ANY;
}
if (style != NULL)
xmlFree(style);
} else if (indata && xmlStrEqual(name, (const xmlChar *)"data") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_END_ELEMENT) {
break;
} else if (inea && xmlStrEqual(name, (const xmlChar *)"ea") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_END_ELEMENT) {
break;
}
} else {
if (xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
if (xmlStrEqual(name, (const xmlChar *)"data")) {
cli_dbgmsg("cli_scanxar: xmlTextReaderRead read <data>\n");
indata = 1;
} else if (xmlStrEqual(name, (const xmlChar *)"ea")) {
cli_dbgmsg("cli_scanxar: xmlTextReaderRead read <ea>\n");
inea = 1;
}
} else if ((xmlTextReaderNodeType(reader) == XML_READER_TYPE_END_ELEMENT) &&
xmlStrEqual(name, (const xmlChar *)"xar")) {
cli_dbgmsg("cli_scanxar: finished parsing xar TOC.\n");
break;
}
}
rc = xmlTextReaderRead(reader);
}
if (gotoffset && gotlength && gotsize) {
rc = CL_SUCCESS;
} else if (0 == gotoffset + gotlength + gotsize)
rc = CL_BREAK;
else
rc = CL_EFORMAT;
return rc;
}
/*
xar_scan_subdocuments - check TOC for xml subdocuments. If found, extract and
scan in memory.
Parameters:
reader - xmlTextReaderPtr
ctx - pointer to cli_ctx
Returns:
CL_SUCCESS - subdoc found and clean scan (or virus found and SCAN_ALLMATCHES), or no subdocument
other - error return code from cli_magic_scan_buff()
*/
static int xar_scan_subdocuments(xmlTextReaderPtr reader, cli_ctx *ctx)
{
int rc = CL_SUCCESS, subdoc_len, fd;
xmlChar *subdoc;
const xmlChar *name;
char *tmpname;
while (xmlTextReaderRead(reader) == 1) {
name = xmlTextReaderConstLocalName(reader);
if (name == NULL) {
cli_dbgmsg("cli_scanxar: xmlTextReaderConstLocalName() no name.\n");
rc = CL_EFORMAT;
break;
}
if (xmlStrEqual(name, (const xmlChar *)"toc") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT)
return CL_SUCCESS;
if (xmlStrEqual(name, (const xmlChar *)"subdoc") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
subdoc = xmlTextReaderReadInnerXml(reader);
if (subdoc == NULL) {
cli_dbgmsg("cli_scanxar: no content in subdoc element.\n");
xmlTextReaderNext(reader);
continue;
}
subdoc_len = xmlStrlen(subdoc);
cli_dbgmsg("cli_scanxar: in-memory scan of xml subdocument, len %i.\n", subdoc_len);
rc = cli_magic_scan_buff(subdoc, subdoc_len, ctx, NULL, LAYER_ATTRIBUTES_NONE);
/* make a file to leave if --leave-temps in effect */
if (ctx->engine->keeptmp) {
if ((rc = cli_gentempfd(ctx->this_layer_tmpdir, &tmpname, &fd)) != CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: Can't create temporary file for subdocument.\n");
} else {
cli_dbgmsg("cli_scanxar: Writing subdoc to temp file %s.\n", tmpname);
if (cli_writen(fd, subdoc, subdoc_len) == (size_t)-1) {
cli_dbgmsg("cli_scanxar: cli_writen error writing subdoc temporary file.\n");
rc = CL_EWRITE;
/* preserve the write error; still close and unlink the temp file */
xar_cleanup_temp_file(ctx, fd, tmpname);
} else {
rc = xar_cleanup_temp_file(ctx, fd, tmpname);
}
tmpname = NULL;
}
}
xmlFree(subdoc);
if (rc != CL_SUCCESS)
return rc;
xmlTextReaderNext(reader);
}
}
return rc;
}
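/* xar_hash_init returns the context for the requested algorithm and also stores it through
sc (SHA1) or mc (MD5), so the caller can later finalize or discard whichever contexts were
actually created. */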
static void *xar_hash_init(int hash, void **sc, void **mc)
{
if (!sc && !mc)
return NULL;
switch (hash) {
case XAR_CKSUM_SHA1:
*sc = cl_hash_init("sha1");
if (!(*sc)) {
return NULL;
}
return *sc;
case XAR_CKSUM_MD5:
*mc = cl_hash_init("md5");
if (!(*mc)) {
return NULL;
}
return *mc;
case XAR_CKSUM_OTHER:
case XAR_CKSUM_NONE:
default:
return NULL;
}
}
static void xar_hash_update(void *hash_ctx, void *data, unsigned long size, int hash)
{
if (!hash_ctx || !data || !size)
return;
switch (hash) {
case XAR_CKSUM_NONE:
case XAR_CKSUM_OTHER:
return;
}
cl_update_hash(hash_ctx, data, size);
}
static void xar_hash_final(void *hash_ctx, void *result, int hash)
{
if (!hash_ctx || !result)
return;
switch (hash) {
case XAR_CKSUM_OTHER:
case XAR_CKSUM_NONE:
return;
}
cl_finish_hash(hash_ctx, result);
}
static int xar_hash_check(int hash, const void *result, const void *expected)
{
int len;
if (!result || !expected)
return 1;
switch (hash) {
case XAR_CKSUM_SHA1:
len = SHA1_HASH_SIZE;
break;
case XAR_CKSUM_MD5:
len = MD5_HASH_SIZE;
break;
case XAR_CKSUM_OTHER:
case XAR_CKSUM_NONE:
default:
return 1;
}
return memcmp(result, expected, len);
}
/*
cli_scanxar - scan an xar archive.
Parameters:
ctx - pointer to cli_ctx.
returns - CL_SUCCESS or CL_ error code.
*/
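/* Note: typically reached from the file type dispatcher once the current fmap has been
identified as a xar archive (CL_TYPE_XAR). */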
int cli_scanxar(cli_ctx *ctx)
{
int rc = CL_SUCCESS;
unsigned int cksum_fails = 0;
unsigned int extract_errors = 0;
int fd = -1;
struct xar_header hdr;
fmap_t *map = ctx->fmap;
size_t length, offset, size, at;
int encoding;
z_stream strm;
char *toc, *tmpname = NULL;
xmlTextReaderPtr reader = NULL;
int a_hash, e_hash;
unsigned char *a_cksum = NULL, *e_cksum = NULL;
void *a_hash_ctx = NULL, *e_hash_ctx = NULL;
char e_hash_result[SHA1_HASH_SIZE];
char a_hash_result[SHA1_HASH_SIZE];
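/* SHA1 produces the largest digest handled here, so SHA1_HASH_SIZE-byte buffers are big enough
for either SHA1 or MD5 results. */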
memset(&strm, 0x00, sizeof(z_stream));
/* retrieve xar header */
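/* On-disk layout: 32-bit magic 'xar!', 16-bit header size, 16-bit version, 64-bit compressed and
decompressed TOC lengths, and a 32-bit checksum algorithm id. All fields are big-endian, hence
the be*_to_host() conversions below. */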
if (fmap_readn(ctx->fmap, &hdr, 0, sizeof(hdr)) != sizeof(hdr)) {
cli_dbgmsg("cli_scanxar: Invalid header, too short.\n");
return CL_EFORMAT;
}
hdr.magic = be32_to_host(hdr.magic);
if (hdr.magic == XAR_HEADER_MAGIC) {
cli_dbgmsg("cli_scanxar: Matched magic\n");
} else {
cli_dbgmsg("cli_scanxar: Invalid magic\n");
return CL_EFORMAT;
}
hdr.size = be16_to_host(hdr.size);
hdr.version = be16_to_host(hdr.version);
hdr.toc_length_compressed = be64_to_host(hdr.toc_length_compressed);
hdr.toc_length_decompressed = be64_to_host(hdr.toc_length_decompressed);
hdr.chksum_alg = be32_to_host(hdr.chksum_alg);
/* cli_dbgmsg("hdr.magic %x\n", hdr.magic); */
/* cli_dbgmsg("hdr.size %i\n", hdr.size); */
/* cli_dbgmsg("hdr.version %i\n", hdr.version); */
/* cli_dbgmsg("hdr.toc_length_compressed %lu\n", hdr.toc_length_compressed); */
/* cli_dbgmsg("hdr.toc_length_decompressed %lu\n", hdr.toc_length_decompressed); */
/* cli_dbgmsg("hdr.chksum_alg %i\n", hdr.chksum_alg); */
/* Uncompress TOC */
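/* The TOC is a zlib-compressed XML document that immediately follows the header; inflate it
into a NUL-terminated buffer of toc_length_decompressed bytes before scanning it and handing
it to the XML reader. */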
strm.next_in = (unsigned char *)fmap_need_off_once(ctx->fmap, hdr.size, hdr.toc_length_compressed);
if (strm.next_in == NULL) {
cli_dbgmsg("cli_scanxar: fmap_need_off_once fails on TOC.\n");
return CL_EREAD;
}
strm.avail_in = hdr.toc_length_compressed;
toc = cli_max_malloc(hdr.toc_length_decompressed + 1);
if (toc == NULL) {
cli_dbgmsg("cli_scanxar: cli_max_malloc fails on TOC decompress buffer.\n");
return CL_EMEM;
}
toc[hdr.toc_length_decompressed] = '\0';
strm.avail_out = hdr.toc_length_decompressed;
strm.next_out = (unsigned char *)toc;
rc = inflateInit(&strm);
if (rc != Z_OK) {
cli_dbgmsg("cli_scanxar:inflateInit error %i \n", rc);
rc = CL_EFORMAT;
goto exit_toc;
}
rc = inflate(&strm, Z_SYNC_FLUSH);
if (rc != Z_OK && rc != Z_STREAM_END) {
inflateEnd(&strm);
cli_dbgmsg("cli_scanxar:inflate error %i \n", rc);
rc = CL_EFORMAT;
goto exit_toc;
}
rc = inflateEnd(&strm);
if (rc != Z_OK) {
cli_dbgmsg("cli_scanxar:inflateEnd error %i \n", rc);
rc = CL_EFORMAT;
goto exit_toc;
}
if (hdr.toc_length_decompressed != strm.total_out) {
cli_dbgmsg("TOC decompress length %" PRIu64 " does not match amount decompressed %lu\n",
hdr.toc_length_decompressed, strm.total_out);
toc[strm.total_out] = '\0';
hdr.toc_length_decompressed = strm.total_out;
}
/* cli_dbgmsg("cli_scanxar: TOC xml:\n%s\n", toc); */
/* printf("cli_scanxar: TOC xml:\n%s\n", toc); */
/* cli_dbgmsg("cli_scanxar: TOC end:\n"); */
/* printf("cli_scanxar: TOC end:\n"); */
/* scan the xml */
cli_dbgmsg("cli_scanxar: scanning xar TOC xml in memory.\n");
rc = cli_magic_scan_buff(toc, hdr.toc_length_decompressed, ctx, NULL, LAYER_ATTRIBUTES_NONE);
if (rc != CL_SUCCESS) {
goto exit_toc;
}
/* make a file to leave if --leave-temps in effect */
if (ctx->engine->keeptmp) {
if ((rc = cli_gentempfd(ctx->this_layer_tmpdir, &tmpname, &fd)) != CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: Can't create temporary file for TOC.\n");
goto exit_toc;
}
if (cli_writen(fd, toc, hdr.toc_length_decompressed) == (size_t)-1) {
cli_dbgmsg("cli_scanxar: cli_writen error writing TOC.\n");
rc = CL_EWRITE;
xar_cleanup_temp_file(ctx, fd, tmpname);
goto exit_toc;
}
rc = xar_cleanup_temp_file(ctx, fd, tmpname);
tmpname = NULL;
if (rc != CL_SUCCESS)
goto exit_toc;
}
reader = xmlReaderForMemory(toc, hdr.toc_length_decompressed, "noname.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
if (reader == NULL) {
cli_dbgmsg("cli_scanxar: xmlReaderForMemory error for TOC\n");
goto exit_toc;
}
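/* Scan any subdocuments embedded in the TOC before walking the file entries. */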
rc = xar_scan_subdocuments(reader, ctx);
if (rc != CL_SUCCESS) {
cli_dbgmsg("xar_scan_subdocuments returns %i.\n", rc);
goto exit_reader;
}
/* Walk the TOC XML and extract files */
fd = -1;
tmpname = NULL;
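/* Walk each <file> data entry in the TOC: decompress its heap data to a temp file, verify checksums, and scan it. */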
while (CL_SUCCESS == (rc = xar_get_toc_data_values(reader, &length, &offset, &size, &encoding,
&a_cksum, &a_hash, &e_cksum, &e_hash))) {
int do_extract_cksum = 1;
unsigned char *blockp;
void *a_sc, *e_sc;
void *a_mc, *e_mc;
char *expected;
/* clean up temp file from previous loop iteration */
if (fd > -1 && tmpname) {
rc = xar_cleanup_temp_file(ctx, fd, tmpname);
tmpname = NULL;
if (rc != CL_SUCCESS)
goto exit_reader;
}
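/* Heap data offsets in the TOC are relative to the heap, which begins after the xar header and the compressed TOC. */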
at = offset + hdr.toc_length_compressed + hdr.size;
if ((rc = cli_gentempfd(ctx->this_layer_tmpdir, &tmpname, &fd)) != CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: Can't generate temporary file.\n");
goto exit_reader;
}
cli_dbgmsg("cli_scanxar: decompress into temp file:\n%s, size %zu,\n"
"from xar heap offset %zu length %zu\n",
tmpname, size, offset, length);
a_hash_ctx = xar_hash_init(a_hash, &a_sc, &a_mc);
e_hash_ctx = xar_hash_init(e_hash, &e_sc, &e_mc);
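/* The archived checksum is computed over the stored (possibly compressed) bytes, the extracted checksum over the decompressed output. */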
switch (encoding) {
case CL_TYPE_GZ:
/* inflate gzip directly because file segments do not contain magic */
memset(&strm, 0, sizeof(strm));
if ((rc = inflateInit(&strm)) != Z_OK) {
cli_dbgmsg("cli_scanxar: InflateInit failed: %d\n", rc);
rc = CL_EFORMAT;
extract_errors++;
break;
}
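/* Stream the compressed segment from the fmap in page-sized chunks, inflating each chunk to the temp file. */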
while ((size_t)at < map->len && (unsigned long)at < offset + hdr.toc_length_compressed + hdr.size + length) {
unsigned long avail_in;
void *next_in;
unsigned int bytes = MIN(map->len - at, map->pgsz);
bytes = MIN(length, bytes);
if (!(strm.next_in = next_in = (void *)fmap_need_off_once(map, at, bytes))) {
cli_dbgmsg("cli_scanxar: Can't read %u bytes @ %lu.\n", bytes, (long unsigned)at);
inflateEnd(&strm);
rc = CL_EREAD;
goto exit_tmpfile;
}
at += bytes;
strm.avail_in = avail_in = bytes;
do {
int inf, outsize = 0;
unsigned char buff[FILEBUFF];
strm.avail_out = sizeof(buff);
strm.next_out = buff;
inf = inflate(&strm, Z_SYNC_FLUSH);
if (inf != Z_OK && inf != Z_STREAM_END && inf != Z_BUF_ERROR) {
cli_dbgmsg("cli_scanxar: inflate error %i %s.\n", inf, strm.msg ? strm.msg : "");
rc = CL_EFORMAT;
extract_errors++;
break;
}
bytes = sizeof(buff) - strm.avail_out;
if (e_hash_ctx != NULL)
xar_hash_update(e_hash_ctx, buff, bytes, e_hash);
if (cli_writen(fd, buff, bytes) == (size_t)-1) {
cli_dbgmsg("cli_scanxar: cli_writen error file %s.\n", tmpname);
inflateEnd(&strm);
rc = CL_EWRITE;
goto exit_tmpfile;
}
outsize += sizeof(buff) - strm.avail_out;
if (cli_checklimits("cli_scanxar", ctx, outsize, 0, 0) != CL_CLEAN) {
break;
}
if (inf == Z_STREAM_END) {
break;
}
} while (strm.avail_out == 0);
if (rc != CL_SUCCESS)
break;
avail_in -= strm.avail_in;
if (a_hash_ctx != NULL)
xar_hash_update(a_hash_ctx, next_in, avail_in, a_hash);
}
inflateEnd(&strm);
break;
case CL_TYPE_7Z:
#define CLI_LZMA_OBUF_SIZE 1024 * 1024
#define CLI_LZMA_HDR_SIZE LZMA_PROPS_SIZE + 8
#define CLI_LZMA_IBUF_SIZE CLI_LZMA_OBUF_SIZE >> 2 /* estimated compression ratio 25% */
{
struct CLI_LZMA lz;
unsigned long in_remaining = MIN(length, map->len - at);
unsigned long out_size = 0;
unsigned char *buff = __lzma_wrap_alloc(NULL, CLI_LZMA_OBUF_SIZE);
int lret;
if (length > in_remaining)
length = in_remaining;
memset(&lz, 0, sizeof(lz));
if (buff == NULL) {
cli_dbgmsg("cli_scanxar: memory request for lzma decompression buffer fails.\n");
rc = CL_EMEM;
goto exit_tmpfile;
}
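/* The stored LZMA stream starts with a header (LZMA properties plus an 8-byte uncompressed size); read it to initialize the decoder. */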
blockp = (void *)fmap_need_off_once(map, at, CLI_LZMA_HDR_SIZE);
if (blockp == NULL) {
char errbuff[128];
cli_strerror(errno, errbuff, sizeof(errbuff));
cli_dbgmsg("cli_scanxar: Can't read %i bytes @ %zu, errno:%s.\n",
CLI_LZMA_HDR_SIZE, at, errbuff);
rc = CL_EREAD;
__lzma_wrap_free(NULL, buff);
goto exit_tmpfile;
}
lz.next_in = blockp;
lz.avail_in = CLI_LZMA_HDR_SIZE;
if (a_hash_ctx != NULL)
xar_hash_update(a_hash_ctx, blockp, CLI_LZMA_HDR_SIZE, a_hash);
lret = cli_LzmaInit(&lz, 0);
if (lret != LZMA_RESULT_OK) {
cli_dbgmsg("cli_scanxar: cli_LzmaInit() fails: %i.\n", lret);
rc = CL_EFORMAT;
__lzma_wrap_free(NULL, buff);
extract_errors++;
break;
}
at += CLI_LZMA_HDR_SIZE;
in_remaining -= CLI_LZMA_HDR_SIZE;
while (at < map->len && at < offset + (size_t)hdr.toc_length_compressed + (size_t)hdr.size + length) {
SizeT avail_in;
SizeT avail_out;
void *next_in;
unsigned long in_consumed;
lz.next_out = buff;
lz.avail_out = CLI_LZMA_OBUF_SIZE;
lz.avail_in = avail_in = MIN(CLI_LZMA_IBUF_SIZE, in_remaining);
lz.next_in = next_in = (void *)fmap_need_off_once(map, at, lz.avail_in);
if (lz.next_in == NULL) {
char errbuff[128];
cli_strerror(errno, errbuff, sizeof(errbuff));
cli_dbgmsg("cli_scanxar: Can't read %zu bytes @ %zu, errno: %s.\n",
lz.avail_in, at, errbuff);
rc = CL_EREAD;
__lzma_wrap_free(NULL, buff);
cli_LzmaShutdown(&lz);
goto exit_tmpfile;
}
lret = cli_LzmaDecode(&lz);
if (lret != LZMA_RESULT_OK && lret != LZMA_STREAM_END) {
cli_dbgmsg("cli_scanxar: cli_LzmaDecode() fails: %i.\n", lret);
rc = CL_EFORMAT;
extract_errors++;
break;
}
in_consumed = avail_in - lz.avail_in;
in_remaining -= in_consumed;
at += in_consumed;
avail_out = CLI_LZMA_OBUF_SIZE - lz.avail_out;
if (avail_out == 0)
cli_dbgmsg("cli_scanxar: cli_LzmaDecode() produces no output for "
"avail_in %llu, avail_out %llu.\n",
(long long unsigned)avail_in, (long long unsigned)avail_out);
if (a_hash_ctx != NULL)
xar_hash_update(a_hash_ctx, next_in, in_consumed, a_hash);
if (e_hash_ctx != NULL)
xar_hash_update(e_hash_ctx, buff, avail_out, e_hash);
/* Write a decompressed block. */
/* cli_dbgmsg("Writing %li bytes to LZMA decompress temp file, " */
/* "consumed %li of %li available compressed bytes.\n", */
/* avail_out, in_consumed, avail_in); */
if (cli_writen(fd, buff, avail_out) == (size_t)-1) {
cli_dbgmsg("cli_scanxar: cli_writen error writing lzma temp file for %llu bytes.\n",
(long long unsigned)avail_out);
__lzma_wrap_free(NULL, buff);
cli_LzmaShutdown(&lz);
rc = CL_EWRITE;
goto exit_tmpfile;
}
/* Check file size limitation. */
out_size += avail_out;
if (cli_checklimits("cli_scanxar", ctx, out_size, 0, 0) != CL_CLEAN) {
break;
}
if (lret == LZMA_STREAM_END)
break;
}
cli_LzmaShutdown(&lz);
__lzma_wrap_free(NULL, buff);
} break;
case CL_TYPE_ANY:
default:
case CL_TYPE_BZ:
case CL_TYPE_XZ:
/* for uncompressed, bzip2, xz, and unknown, just pull the file, cli_magic_scan_desc does the rest */
do_extract_cksum = 0;
{
size_t writelen = MIN(map->len - at, length);
if (ctx->engine->maxfilesize)
writelen = MIN((size_t)(ctx->engine->maxfilesize), writelen);
if (!(blockp = (void *)fmap_need_off_once(map, at, writelen))) {
char errbuff[128];
cli_strerror(errno, errbuff, sizeof(errbuff));
cli_dbgmsg("cli_scanxar: Can't read %zu bytes @ %zu, errno:%s.\n",
writelen, at, errbuff);
rc = CL_EREAD;
goto exit_tmpfile;
}
if (a_hash_ctx != NULL)
xar_hash_update(a_hash_ctx, blockp, writelen, a_hash);
if (cli_writen(fd, blockp, writelen) == (size_t)-1) {
cli_dbgmsg("cli_scanxar: cli_writen error %zu bytes @ %zu.\n", writelen, at);
rc = CL_EWRITE;
goto exit_tmpfile;
}
/*break;*/
}
} /* end of switch */
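/* Finalize the running hashes and compare them against the checksums recorded in the TOC. */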
if (a_hash_ctx != NULL) {
xar_hash_final(a_hash_ctx, a_hash_result, a_hash);
a_hash_ctx = NULL;
} else if (rc == CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: archived-checksum missing.\n");
cksum_fails++;
}
if (e_hash_ctx != NULL) {
xar_hash_final(e_hash_ctx, e_hash_result, e_hash);
e_hash_ctx = NULL;
} else if (rc == CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: extracted-checksum(unarchived-checksum) missing.\n");
cksum_fails++;
}
if (rc == CL_SUCCESS) {
if (a_cksum != NULL) {
expected = cli_hex2str((char *)a_cksum);
if (xar_hash_check(a_hash, a_hash_result, expected) != 0) {
cli_dbgmsg("cli_scanxar: archived-checksum mismatch.\n");
cksum_fails++;
} else {
cli_dbgmsg("cli_scanxar: archived-checksum matched.\n");
}
free(expected);
}
if (e_cksum != NULL) {
if (do_extract_cksum) {
expected = cli_hex2str((char *)e_cksum);
if (xar_hash_check(e_hash, e_hash_result, expected) != 0) {
cli_dbgmsg("cli_scanxar: extracted-checksum mismatch.\n");
cksum_fails++;
} else {
cli_dbgmsg("cli_scanxar: extracted-checksum matched.\n");
}
free(expected);
}
}
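/* Scan the extracted file contents. */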
rc = cli_magic_scan_desc(fd, tmpname, ctx, NULL, LAYER_ATTRIBUTES_NONE); /// TODO: collect file names in xar_get_toc_data_values()
if (rc != CL_SUCCESS) {
goto exit_tmpfile;
}
}
if (a_cksum != NULL) {
xmlFree(a_cksum);
a_cksum = NULL;
}
if (e_cksum != NULL) {
xmlFree(e_cksum);
e_cksum = NULL;
}
}
exit_tmpfile:
xar_cleanup_temp_file(ctx, fd, tmpname);
if (a_hash_ctx != NULL)
xar_hash_final(a_hash_ctx, a_hash_result, a_hash);
if (e_hash_ctx != NULL)
xar_hash_final(e_hash_ctx, e_hash_result, e_hash);
exit_reader:
if (a_cksum != NULL)
xmlFree(a_cksum);
if (e_cksum != NULL)
xmlFree(e_cksum);
xmlTextReaderClose(reader);
xmlFreeTextReader(reader);
exit_toc:
free(toc);
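/* CL_BREAK indicates normal end of the TOC file list, not an error. */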
if (rc == CL_BREAK)
rc = CL_SUCCESS;
if (cksum_fails + extract_errors != 0) {
cli_dbgmsg("cli_scanxar: %u checksum errors and %u extraction errors.\n",
cksum_fails, extract_errors);
}
return rc;
}