/* clamav/libclamav/xar.c */

/*
* Copyright (C) 2013-2021 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2013 Sourcefire, Inc.
*
2013-09-13 12:22:58 -04:00
* Authors: Steven Morgan <smorgan@sourcefire.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
2013-09-13 12:22:58 -04:00
#include <errno.h>
#include "xar.h"
#include "fmap.h"
2019-03-26 15:09:52 -04:00
2013-09-13 12:22:58 -04:00
#if HAVE_LIBXML2
#include <libxml/xmlreader.h>
2014-07-01 19:38:01 -04:00
#include "clamav.h"
#include "str.h"
#include "scanners.h"
2013-09-13 12:22:58 -04:00
#include "inflate64.h"
#include "lzma_iface.h"
2013-09-13 12:22:58 -04:00
/*
   xar_cleanup_temp_file - cleanup after cli_gentempfd
   parameters:
     ctx     - cli_ctx context pointer; ctx->engine->keeptmp decides whether
               the file is left on disk
     fd      - fd to close (ignored when negative)
     tmpname - name of file to unlink, address of storage to free (may be NULL)
   returns - CL_SUCCESS or CL_EUNLINK
 */
static int xar_cleanup_temp_file(cli_ctx *ctx, int fd, char *tmpname)
{
    int rc = CL_SUCCESS;

    if (fd > -1)
        close(fd);

    if (tmpname == NULL)
        return rc;

    /* Only remove the file when temporary files are not being kept. */
    if (!ctx->engine->keeptmp && cli_unlink(tmpname)) {
        cli_dbgmsg("cli_scanxar: error unlinking tmpfile %s\n", tmpname);
        rc = CL_EUNLINK;
    }

    /* The name buffer is released whether or not the unlink succeeded. */
    free(tmpname);
    return rc;
}
/*
   xar_get_numeric_from_xml_element - extract xml element value as numeric
   parameters:
     reader - xmlTextReaderPtr; advanced by one node, which must be a text node
     value  - pointer to size_t to contain the returned value
   returns - CL_SUCCESS, or CL_EFORMAT when there is no text node, the text
             does not start with a decimal number, the number is out of range
             for long, or the number is negative
 */
static int xar_get_numeric_from_xml_element(xmlTextReaderPtr reader, size_t *value)
{
    const xmlChar *numstr = NULL;
    char *endptr          = NULL;
    long numval;

    if (xmlTextReaderRead(reader) == 1 && xmlTextReaderNodeType(reader) == XML_READER_TYPE_TEXT)
        numstr = xmlTextReaderConstValue(reader);

    if (numstr == NULL) {
        cli_dbgmsg("cli_scanxar: No text for XML element\n");
        return CL_EFORMAT;
    }

    /* errno must be cleared first: strtol only sets it on failure. */
    errno  = 0;
    numval = strtol((const char *)numstr, &endptr, 10);

    if ((((numval == LONG_MAX) || (numval == LONG_MIN)) && errno) ||
        ((const xmlChar *)endptr == numstr)) {
        /* out of range, or no digits were consumed at all */
        cli_dbgmsg("cli_scanxar: XML element value invalid\n");
        return CL_EFORMAT;
    }

    if (numval < 0) {
        /* the result is stored in a size_t, so negative values are rejected */
        cli_dbgmsg("cli_scanxar: XML element value %li\n", numval);
        return CL_EFORMAT;
    }

    *value = numval;
    return CL_SUCCESS;
}
/*
   xar_get_checksum_values - extract checksum and hash algorithm from xml element
   parameters:
     reader - xmlTextReaderPtr positioned on the checksum element; advanced by
              one node to reach the element text.
     cksum  - pointer to char* for returning checksum value. Must hold NULL or a
              value previously returned through this function; a previous value
              is released. On return it is NULL or an xmlStrdup() copy that the
              caller releases with xmlFree().
     hash   - pointer to int for returning checksum algorithm
              (XAR_CKSUM_NONE, XAR_CKSUM_SHA1, XAR_CKSUM_MD5 or XAR_CKSUM_OTHER).
   returns - void
 */
static void xar_get_checksum_values(xmlTextReaderPtr reader, unsigned char **cksum, int *hash)
{
    xmlChar *style = xmlTextReaderGetAttribute(reader, (const xmlChar *)"style");
    const xmlChar *xmlval;

    /* The same checksum element can appear more than once inside one
       <data>/<ea>; release the earlier copy instead of leaking it. */
    if (*cksum != NULL) {
        xmlFree(*cksum);
        *cksum = NULL;
    }

    *hash = XAR_CKSUM_NONE;
    if (style == NULL) {
        cli_dbgmsg("cli_scanxar: xmlTextReaderGetAttribute no style attribute "
                   "for checksum element\n");
    } else {
        cli_dbgmsg("cli_scanxar: checksum algorithm is %s.\n", style);
        if (0 == xmlStrcasecmp(style, (const xmlChar *)"sha1")) {
            *hash = XAR_CKSUM_SHA1;
        } else if (0 == xmlStrcasecmp(style, (const xmlChar *)"md5")) {
            *hash = XAR_CKSUM_MD5;
        } else {
            cli_dbgmsg("cli_scanxar: checksum algorithm %s is unsupported.\n", style);
            *hash = XAR_CKSUM_OTHER;
        }
        xmlFree(style);
    }

    if (xmlTextReaderRead(reader) != 1 || xmlTextReaderNodeType(reader) != XML_READER_TYPE_TEXT) {
        cli_dbgmsg("cli_scanxar: No text for XML checksum element.\n");
        return;
    }

    xmlval = xmlTextReaderConstValue(reader);
    if (xmlval == NULL) {
        cli_dbgmsg("cli_scanxar: xmlTextReaderConstValue() returns NULL for checksum value.\n");
        return;
    }

    cli_dbgmsg("cli_scanxar: checksum value is %s.\n", xmlval);
    /* The value is hex text, so it must be exactly twice the digest length. */
    if (((*hash == XAR_CKSUM_SHA1) && (xmlStrlen(xmlval) == 2 * CLI_HASHLEN_SHA1)) ||
        ((*hash == XAR_CKSUM_MD5) && (xmlStrlen(xmlval) == 2 * CLI_HASHLEN_MD5))) {
        *cksum = xmlStrdup(xmlval);
    } else {
        cli_dbgmsg("cli_scanxar: checksum type is unknown or length is invalid.\n");
        *hash = XAR_CKSUM_OTHER;
    }
}
2013-09-13 12:22:58 -04:00
/*
xar_get_toc_data_values - return the values of a <data> or <ea> xml element that represent
2013-09-13 12:22:58 -04:00
an extent of data on the heap.
parameters:
reader - xmlTextReaderPtr
length - pointer to long for returning value of the <length> element.
offset - pointer to long for returning value of the <offset> element.
size - pointer to long for returning value of the <size> element.
encoding - pointer to int for returning indication of the <encoding> style attribute.
a_cksum - pointer to char* for return archived checksum value.
a_hash - pointer to int for returning archived checksum algorithm.
e_cksum - pointer to char* for return extracted checksum value.
e_hash - pointer to int for returning extracted checksum algorithm.
2013-09-13 12:22:58 -04:00
returns - CL_FORMAT, CL_SUCCESS, CL_BREAK. CL_BREAK indicates no more <data>/<ea> element.
*/
2016-06-21 17:22:52 -04:00
static int xar_get_toc_data_values(xmlTextReaderPtr reader, size_t *length, size_t *offset, size_t *size, int *encoding,
unsigned char **a_cksum, int *a_hash, unsigned char **e_cksum, int *e_hash)
2013-09-13 12:22:58 -04:00
{
const xmlChar *name;
int indata = 0, inea = 0;
int rc, gotoffset = 0, gotlength = 0, gotsize = 0;
2013-09-13 12:22:58 -04:00
*a_cksum = NULL;
*a_hash = XAR_CKSUM_NONE;
*e_cksum = NULL;
*e_hash = XAR_CKSUM_NONE;
*encoding = CL_TYPE_ANY;
2013-09-13 12:22:58 -04:00
rc = xmlTextReaderRead(reader);
while (rc == 1) {
name = xmlTextReaderConstLocalName(reader);
if (indata || inea) {
/* cli_dbgmsg("cli_scanxar: xmlTextReaderRead read %s\n", name); */
if (xmlStrEqual(name, (const xmlChar *)"offset") &&
2013-09-13 12:22:58 -04:00
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
if (CL_SUCCESS == xar_get_numeric_from_xml_element(reader, offset))
gotoffset = 1;
2013-09-13 12:22:58 -04:00
} else if (xmlStrEqual(name, (const xmlChar *)"length") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
if (CL_SUCCESS == xar_get_numeric_from_xml_element(reader, length))
gotlength = 1;
2013-09-13 12:22:58 -04:00
} else if (xmlStrEqual(name, (const xmlChar *)"size") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
if (CL_SUCCESS == xar_get_numeric_from_xml_element(reader, size))
gotsize = 1;
2013-09-13 12:22:58 -04:00
} else if (xmlStrEqual(name, (const xmlChar *)"archived-checksum") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
cli_dbgmsg("cli_scanxar: <archived-checksum>:\n");
xar_get_checksum_values(reader, a_cksum, a_hash);
} else if ((xmlStrEqual(name, (const xmlChar *)"extracted-checksum") ||
xmlStrEqual(name, (const xmlChar *)"unarchived-checksum")) &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
cli_dbgmsg("cli_scanxar: <extracted-checksum>:\n");
xar_get_checksum_values(reader, e_cksum, e_hash);
2013-09-13 12:22:58 -04:00
} else if (xmlStrEqual(name, (const xmlChar *)"encoding") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
xmlChar *style = xmlTextReaderGetAttribute(reader, (const xmlChar *)"style");
2013-09-13 12:22:58 -04:00
if (style == NULL) {
cli_dbgmsg("cli_scaxar: xmlTextReaderGetAttribute no style attribute "
2013-09-13 12:22:58 -04:00
"for encoding element\n");
*encoding = CL_TYPE_ANY;
} else if (xmlStrEqual(style, (const xmlChar *)"application/x-gzip")) {
cli_dbgmsg("cli_scanxar: encoding = application/x-gzip.\n");
*encoding = CL_TYPE_GZ;
2013-09-13 12:22:58 -04:00
} else if (xmlStrEqual(style, (const xmlChar *)"application/octet-stream")) {
cli_dbgmsg("cli_scanxar: encoding = application/octet-stream.\n");
*encoding = CL_TYPE_ANY;
2013-09-13 12:22:58 -04:00
} else if (xmlStrEqual(style, (const xmlChar *)"application/x-bzip2")) {
cli_dbgmsg("cli_scanxar: encoding = application/x-bzip2.\n");
*encoding = CL_TYPE_BZ;
} else if (xmlStrEqual(style, (const xmlChar *)"application/x-lzma")) {
cli_dbgmsg("cli_scanxar: encoding = application/x-lzma.\n");
*encoding = CL_TYPE_7Z;
} else if (xmlStrEqual(style, (const xmlChar *)"application/x-xz")) {
2013-09-13 12:22:58 -04:00
cli_dbgmsg("cli_scanxar: encoding = application/x-xz.\n");
2013-10-08 17:17:44 -04:00
*encoding = CL_TYPE_XZ;
2013-09-13 12:22:58 -04:00
} else {
cli_dbgmsg("cli_scaxar: unknown style value=%s for encoding element\n", style);
2013-09-13 12:22:58 -04:00
*encoding = CL_TYPE_ANY;
}
2014-02-05 11:00:23 -05:00
if (style != NULL)
xmlFree(style);
} else if (indata && xmlStrEqual(name, (const xmlChar *)"data") &&
2013-09-13 12:22:58 -04:00
xmlTextReaderNodeType(reader) == XML_READER_TYPE_END_ELEMENT) {
break;
2013-09-13 12:22:58 -04:00
} else if (inea && xmlStrEqual(name, (const xmlChar *)"ea") &&
xmlTextReaderNodeType(reader) == XML_READER_TYPE_END_ELEMENT) {
break;
}
2013-09-13 12:22:58 -04:00
} else {
if (xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT) {
if (xmlStrEqual(name, (const xmlChar *)"data")) {
cli_dbgmsg("cli_scanxar: xmlTextReaderRead read <data>\n");
2013-09-13 12:22:58 -04:00
indata = 1;
} else if (xmlStrEqual(name, (const xmlChar *)"ea")) {
cli_dbgmsg("cli_scanxar: xmlTextReaderRead read <ea>\n");
2013-09-13 12:22:58 -04:00
inea = 1;
}
} else if ((xmlTextReaderNodeType(reader) == XML_READER_TYPE_END_ELEMENT) &&
xmlStrEqual(name, (const xmlChar *)"xar")) {
cli_dbgmsg("cli_scanxar: finished parsing xar TOC.\n");
break;
2013-09-13 12:22:58 -04:00
}
}
rc = xmlTextReaderRead(reader);
}
2013-09-13 12:22:58 -04:00
if (gotoffset && gotlength && gotsize) {
rc = CL_SUCCESS;
} else if (0 == gotoffset + gotlength + gotsize)
2013-09-13 12:22:58 -04:00
rc = CL_BREAK;
else
rc = CL_EFORMAT;
return rc;
}
/*
   xar_scan_subdocuments - check TOC for xml subdocuments. Each one found is
                           extracted and scanned in memory. Reading stops at the
                           <toc> start element.
   Parameters:
     reader - xmlTextReaderPtr
     ctx    - pointer to cli_ctx
   Returns:
     CL_SUCCESS - subdoc found and clean scan (or virus found and SCAN_ALLMATCHES), or no subdocument
     CL_EFORMAT - a node without a name was encountered
     other      - error return code from cli_magic_scan_buff(), or, when the scan
                  itself succeeded and --leave-temps is in effect, the error from
                  creating, writing or cleaning up the temporary copy
 */
static int xar_scan_subdocuments(xmlTextReaderPtr reader, cli_ctx *ctx)
{
    int rc = CL_SUCCESS, subdoc_len, fd = -1;
    xmlChar *subdoc;
    const xmlChar *name;
    char *tmpname = NULL;

    while (xmlTextReaderRead(reader) == 1) {
        name = xmlTextReaderConstLocalName(reader);
        if (name == NULL) {
            cli_dbgmsg("cli_scanxar: xmlTextReaderConstLocalName() no name.\n");
            rc = CL_EFORMAT;
            break;
        }

        /* Only start tags are of interest here. */
        if (xmlTextReaderNodeType(reader) != XML_READER_TYPE_ELEMENT)
            continue;

        if (xmlStrEqual(name, (const xmlChar *)"toc"))
            return CL_SUCCESS;

        if (!xmlStrEqual(name, (const xmlChar *)"subdoc"))
            continue;

        subdoc = xmlTextReaderReadInnerXml(reader);
        if (subdoc == NULL) {
            cli_dbgmsg("cli_scanxar: no content in subdoc element.\n");
            xmlTextReaderNext(reader);
            continue;
        }

        subdoc_len = xmlStrlen(subdoc);
        cli_dbgmsg("cli_scanxar: in-memory scan of xml subdocument, len %i.\n", subdoc_len);
        rc = cli_magic_scan_buff(subdoc, subdoc_len, ctx, NULL);
        if (rc == CL_VIRUS && SCAN_ALLMATCHES)
            rc = CL_SUCCESS;

        /* make a file to leave if --leave-temps in effect */
        if (ctx->engine->keeptmp) {
            /* Track the temp-file status separately so it can never hide the
               scan result (e.g. CL_VIRUS) obtained above. */
            int tmp_rc = cli_gentempfd(ctx->sub_tmpdir, &tmpname, &fd);

            if (tmp_rc != CL_SUCCESS) {
                cli_dbgmsg("cli_scanxar: Can't create temporary file for subdocument.\n");
            } else {
                int cleanup_rc;

                cli_dbgmsg("cli_scanxar: Writing subdoc to temp file %s.\n", tmpname);
                if (cli_writen(fd, subdoc, subdoc_len) == (size_t)-1) {
                    cli_dbgmsg("cli_scanxar: cli_writen error writing subdoc temporary file.\n");
                    tmp_rc = CL_EWRITE;
                }
                cleanup_rc = xar_cleanup_temp_file(ctx, fd, tmpname);
                tmpname    = NULL;
                fd         = -1;
                /* Keep the first failure: a write error wins over the cleanup result. */
                if (tmp_rc == CL_SUCCESS)
                    tmp_rc = cleanup_rc;
            }

            if (rc == CL_SUCCESS)
                rc = tmp_rc;
        }

        xmlFree(subdoc);
        if (rc != CL_SUCCESS)
            return rc;

        /* Skip the subtree that was just scanned. */
        xmlTextReaderNext(reader);
    }

    return rc;
}
/*
   xar_hash_init - create a hash context for the given checksum algorithm
   parameters:
     hash - XAR_CKSUM_* algorithm selector
     sc   - receives the context when hash is XAR_CKSUM_SHA1
     mc   - receives the context when hash is XAR_CKSUM_MD5
   returns - the new context (also stored through sc or mc), or NULL when the
             algorithm is XAR_CKSUM_NONE/XAR_CKSUM_OTHER/unknown, the matching
             output pointer is NULL, or cl_hash_init() fails
 */
static void *xar_hash_init(int hash, void **sc, void **mc)
{
    switch (hash) {
        case XAR_CKSUM_SHA1:
            /* Only the output slot that is actually written must be valid. */
            if (sc == NULL)
                return NULL;
            *sc = cl_hash_init("sha1");
            return *sc;
        case XAR_CKSUM_MD5:
            if (mc == NULL)
                return NULL;
            *mc = cl_hash_init("md5");
            return *mc;
        case XAR_CKSUM_OTHER:
        case XAR_CKSUM_NONE:
        default:
            return NULL;
    }
}
static void xar_hash_update(void *hash_ctx, void *data, unsigned long size, int hash)
{
if (!hash_ctx || !data || !size)
return;
2014-02-08 00:31:12 -05:00
switch (hash) {
case XAR_CKSUM_NONE:
case XAR_CKSUM_OTHER:
return;
}
2014-02-08 00:31:12 -05:00
cl_update_hash(hash_ctx, data, size);
}
static void xar_hash_final(void *hash_ctx, void *result, int hash)
{
if (!hash_ctx || !result)
return;
2014-02-08 00:31:12 -05:00
switch (hash) {
case XAR_CKSUM_OTHER:
case XAR_CKSUM_NONE:
return;
}
2014-02-08 00:31:12 -05:00
cl_finish_hash(hash_ctx, result);
}
/*
   xar_hash_check - compare a computed digest against the expected one
   parameters:
     hash     - XAR_CKSUM_* algorithm selector; fixes the number of bytes compared
     result   - computed digest bytes
     expected - expected digest bytes
   returns - 0 when the digests are equal; non-zero when they differ, when either
             pointer is NULL, or when hash is not XAR_CKSUM_SHA1 or XAR_CKSUM_MD5
 */
static int xar_hash_check(int hash, const void *result, const void *expected)
{
    size_t digest_len;

    if (result == NULL || expected == NULL)
        return 1;

    if (hash == XAR_CKSUM_SHA1)
        digest_len = CLI_HASHLEN_SHA1;
    else if (hash == XAR_CKSUM_MD5)
        digest_len = CLI_HASHLEN_MD5;
    else
        return 1; /* XAR_CKSUM_OTHER, XAR_CKSUM_NONE or unknown: nothing to compare */

    return memcmp(result, expected, digest_len);
}
2013-09-13 12:22:58 -04:00
#endif
2013-09-13 12:22:58 -04:00
/*
cli_scanxar - scan an xar archive.
Parameters:
ctx - pointer to cli_ctx.
returns - CL_SUCCESS or CL_ error code.
*/
int cli_scanxar(cli_ctx *ctx)
{
int rc = CL_SUCCESS;
unsigned int cksum_fails = 0;
unsigned int extract_errors = 0;
2013-09-13 12:22:58 -04:00
#if HAVE_LIBXML2
int fd = -1;
struct xar_header hdr;
2013-09-13 12:22:58 -04:00
fmap_t *map = *ctx->fmap;
2016-06-21 17:22:52 -04:00
size_t length, offset, size, at;
2013-09-13 12:22:58 -04:00
int encoding;
z_stream strm;
char *toc, *tmpname = NULL;
2013-09-13 12:22:58 -04:00
xmlTextReaderPtr reader = NULL;
int a_hash, e_hash;
2013-10-11 12:04:29 -04:00
unsigned char *a_cksum = NULL, *e_cksum = NULL;
2014-10-07 12:39:12 -04:00
void *a_hash_ctx = NULL, *e_hash_ctx = NULL;
2020-08-27 20:42:43 -07:00
char e_hash_result[SHA1_HASH_SIZE];
char a_hash_result[SHA1_HASH_SIZE];
2013-09-13 12:22:58 -04:00
memset(&strm, 0x00, sizeof(z_stream));
2013-09-13 12:22:58 -04:00
/* retrieve xar header */
if (fmap_readn(*ctx->fmap, &hdr, 0, sizeof(hdr)) != sizeof(hdr)) {
cli_dbgmsg("cli_scanxar: Invalid header, too short.\n");
return CL_EFORMAT;
}
hdr.magic = be32_to_host(hdr.magic);
2013-09-13 12:22:58 -04:00
if (hdr.magic == XAR_HEADER_MAGIC) {
cli_dbgmsg("cli_scanxar: Matched magic\n");
} else {
cli_dbgmsg("cli_scanxar: Invalid magic\n");
return CL_EFORMAT;
}
hdr.size = be16_to_host(hdr.size);
hdr.version = be16_to_host(hdr.version);
hdr.toc_length_compressed = be64_to_host(hdr.toc_length_compressed);
2013-09-13 12:22:58 -04:00
hdr.toc_length_decompressed = be64_to_host(hdr.toc_length_decompressed);
hdr.chksum_alg = be32_to_host(hdr.chksum_alg);
2013-09-13 12:22:58 -04:00
/* cli_dbgmsg("hdr.magic %x\n", hdr.magic); */
/* cli_dbgmsg("hdr.size %i\n", hdr.size); */
/* cli_dbgmsg("hdr.version %i\n", hdr.version); */
/* cli_dbgmsg("hdr.toc_length_compressed %lu\n", hdr.toc_length_compressed); */
/* cli_dbgmsg("hdr.toc_length_decompressed %lu\n", hdr.toc_length_decompressed); */
/* cli_dbgmsg("hdr.chksum_alg %i\n", hdr.chksum_alg); */
2013-09-13 12:22:58 -04:00
/* Uncompress TOC */
strm.next_in = (unsigned char *)fmap_need_off_once(*ctx->fmap, hdr.size, hdr.toc_length_compressed);
if (strm.next_in == NULL) {
cli_dbgmsg("cli_scanxar: fmap_need_off_once fails on TOC.\n");
return CL_EREAD;
2013-09-13 12:22:58 -04:00
}
strm.avail_in = hdr.toc_length_compressed;
toc = cli_malloc(hdr.toc_length_decompressed + 1);
2013-09-13 12:22:58 -04:00
if (toc == NULL) {
cli_dbgmsg("cli_scanxar: cli_malloc fails on TOC decompress buffer.\n");
2013-09-13 12:22:58 -04:00
return CL_EMEM;
}
toc[hdr.toc_length_decompressed] = '\0';
strm.avail_out = hdr.toc_length_decompressed;
strm.next_out = (unsigned char *)toc;
rc = inflateInit(&strm);
2013-09-13 12:22:58 -04:00
if (rc != Z_OK) {
cli_dbgmsg("cli_scanxar:inflateInit error %i \n", rc);
2013-09-13 12:22:58 -04:00
rc = CL_EFORMAT;
goto exit_toc;
}
2013-09-13 12:22:58 -04:00
rc = inflate(&strm, Z_SYNC_FLUSH);
if (rc != Z_OK && rc != Z_STREAM_END) {
inflateEnd(&strm);
cli_dbgmsg("cli_scanxar:inflate error %i \n", rc);
2013-09-13 12:22:58 -04:00
rc = CL_EFORMAT;
goto exit_toc;
}
rc = inflateEnd(&strm);
if (rc != Z_OK) {
cli_dbgmsg("cli_scanxar:inflateEnd error %i \n", rc);
2013-09-13 12:22:58 -04:00
rc = CL_EFORMAT;
goto exit_toc;
}
2016-06-21 17:22:52 -04:00
if (hdr.toc_length_decompressed != strm.total_out) {
cli_dbgmsg("TOC decompress length %" PRIu64 " does not match amount decompressed %lu\n",
hdr.toc_length_decompressed, strm.total_out);
toc[strm.total_out] = '\0';
2016-06-21 17:22:52 -04:00
hdr.toc_length_decompressed = strm.total_out;
}
/* cli_dbgmsg("cli_scanxar: TOC xml:\n%s\n", toc); */
/* printf("cli_scanxar: TOC xml:\n%s\n", toc); */
/* cli_dbgmsg("cli_scanxar: TOC end:\n"); */
/* printf("cli_scanxar: TOC end:\n"); */
2013-09-13 12:22:58 -04:00
/* scan the xml */
cli_dbgmsg("cli_scanxar: scanning xar TOC xml in memory.\n");
rc = cli_magic_scan_buff(toc, hdr.toc_length_decompressed, ctx, NULL);
2013-09-13 12:22:58 -04:00
if (rc != CL_SUCCESS) {
if (rc != CL_VIRUS || !SCAN_ALLMATCHES)
goto exit_toc;
2013-09-13 12:22:58 -04:00
}
/* make a file to leave if --leave-temps in effect */
if (ctx->engine->keeptmp) {
Record names of extracted files A way is needed to record scanned file names for two purposes: 1. File names (and extensions) must be stored in the json metadata properties recorded when using the --gen-json clamscan option. Future work may use this to compare file extensions with detected file types. 2. File names are useful when interpretting tmp directory output when using the --leave-temps option. This commit enables file name retention for later use by storing file names in the fmap header structure, if a file name exists. To store the names in fmaps, an optional name argument has been added to any internal scan API's that create fmaps and every call to these APIs has been modified to pass a file name or NULL if a file name is not required. The zip and gpt parsers required some modification to record file names. The NSIS and XAR parsers fail to collect file names at all and will require future work to support file name extraction. Also: - Added recursive extraction to the tmp directory when the --leave-temps option is enabled. When not enabled, the tmp directory structure remains flat so as to prevent the likelihood of exceeding MAX_PATH. The current tmp directory is stored in the scan context. - Made the cli_scanfile() internal API non-static and added it to scanners.h so it would be accessible outside of scanners.c in order to remove code duplication within libmspack.c. - Added function comments to scanners.h and matcher.h - Converted a TDB-type macros and LSIG-type macros to enums for improved type safey. - Converted more return status variables from `int` to `cl_error_t` for improved type safety, and corrected ooxml file typing functions so they use `cli_file_t` exclusively rather than mixing types with `cl_error_t`. - Restructured the magic_scandesc() function to use goto's for error handling and removed the early_ret_from_magicscan() macro and magic_scandesc_cleanup() function. 
This makes the code easier to read and made it easier to add the recursive tmp directory cleanup to magic_scandesc(). - Corrected zip, egg, rar filename extraction issues. - Removed use of extra sub-directory layer for zip, egg, and rar file extraction. For Zip, this also involved changing the extracted filenames to be randomly generated rather than using the "zip.###" file name scheme.
2020-03-19 21:23:54 -04:00
if ((rc = cli_gentempfd(ctx->sub_tmpdir, &tmpname, &fd)) != CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: Can't create temporary file for TOC.\n");
2013-09-13 12:22:58 -04:00
goto exit_toc;
}
if (cli_writen(fd, toc, hdr.toc_length_decompressed) == (size_t)-1) {
cli_dbgmsg("cli_scanxar: cli_writen error writing TOC.\n");
2013-09-13 12:22:58 -04:00
rc = CL_EWRITE;
xar_cleanup_temp_file(ctx, fd, tmpname);
goto exit_toc;
}
rc = xar_cleanup_temp_file(ctx, fd, tmpname);
tmpname = NULL;
2013-09-13 12:22:58 -04:00
if (rc != CL_SUCCESS)
goto exit_toc;
}
reader = xmlReaderForMemory(toc, hdr.toc_length_decompressed, "noname.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
2013-09-13 12:22:58 -04:00
if (reader == NULL) {
cli_dbgmsg("cli_scanxar: xmlReaderForMemory error for TOC\n");
2013-09-13 12:22:58 -04:00
goto exit_toc;
}
rc = xar_scan_subdocuments(reader, ctx);
if (rc != CL_SUCCESS) {
cli_dbgmsg("xar_scan_subdocuments returns %i.\n", rc);
2013-12-04 09:57:05 -05:00
goto exit_reader;
}
2013-09-13 12:22:58 -04:00
/* Walk the TOC XML and extract files */
fd = -1;
2013-09-13 12:22:58 -04:00
tmpname = NULL;
while (CL_SUCCESS == (rc = xar_get_toc_data_values(reader, &length, &offset, &size, &encoding,
&a_cksum, &a_hash, &e_cksum, &e_hash))) {
int do_extract_cksum = 1;
unsigned char *blockp;
void *a_sc, *e_sc;
void *a_mc, *e_mc;
char *expected;
2013-09-13 12:22:58 -04:00
/* clean up temp file from previous loop iteration */
if (fd > -1 && tmpname) {
rc = xar_cleanup_temp_file(ctx, fd, tmpname);
tmpname = NULL;
2013-09-13 12:22:58 -04:00
if (rc != CL_SUCCESS)
goto exit_reader;
}
at = offset + hdr.toc_length_compressed + hdr.size;
Record names of extracted files A way is needed to record scanned file names for two purposes: 1. File names (and extensions) must be stored in the json metadata properties recorded when using the --gen-json clamscan option. Future work may use this to compare file extensions with detected file types. 2. File names are useful when interpretting tmp directory output when using the --leave-temps option. This commit enables file name retention for later use by storing file names in the fmap header structure, if a file name exists. To store the names in fmaps, an optional name argument has been added to any internal scan API's that create fmaps and every call to these APIs has been modified to pass a file name or NULL if a file name is not required. The zip and gpt parsers required some modification to record file names. The NSIS and XAR parsers fail to collect file names at all and will require future work to support file name extraction. Also: - Added recursive extraction to the tmp directory when the --leave-temps option is enabled. When not enabled, the tmp directory structure remains flat so as to prevent the likelihood of exceeding MAX_PATH. The current tmp directory is stored in the scan context. - Made the cli_scanfile() internal API non-static and added it to scanners.h so it would be accessible outside of scanners.c in order to remove code duplication within libmspack.c. - Added function comments to scanners.h and matcher.h - Converted a TDB-type macros and LSIG-type macros to enums for improved type safey. - Converted more return status variables from `int` to `cl_error_t` for improved type safety, and corrected ooxml file typing functions so they use `cli_file_t` exclusively rather than mixing types with `cl_error_t`. - Restructured the magic_scandesc() function to use goto's for error handling and removed the early_ret_from_magicscan() macro and magic_scandesc_cleanup() function. 
This makes the code easier to read and made it easier to add the recursive tmp directory cleanup to magic_scandesc(). - Corrected zip, egg, rar filename extraction issues. - Removed use of extra sub-directory layer for zip, egg, and rar file extraction. For Zip, this also involved changing the extracted filenames to be randomly generated rather than using the "zip.###" file name scheme.
2020-03-19 21:23:54 -04:00
if ((rc = cli_gentempfd(ctx->sub_tmpdir, &tmpname, &fd)) != CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: Can't generate temporary file.\n");
2013-09-13 12:22:58 -04:00
goto exit_reader;
}
2016-06-21 17:22:52 -04:00
cli_dbgmsg("cli_scanxar: decompress into temp file:\n%s, size %zu,\n"
"from xar heap offset %zu length %zu\n",
tmpname, size, offset, length);
2013-09-13 12:22:58 -04:00
a_hash_ctx = xar_hash_init(a_hash, &a_sc, &a_mc);
e_hash_ctx = xar_hash_init(e_hash, &e_sc, &e_mc);
2013-09-13 12:22:58 -04:00
switch (encoding) {
case CL_TYPE_GZ:
/* inflate gzip directly because file segments do not contain magic */
memset(&strm, 0, sizeof(strm));
if ((rc = inflateInit(&strm)) != Z_OK) {
cli_dbgmsg("cli_scanxar: InflateInit failed: %d\n", rc);
rc = CL_EFORMAT;
extract_errors++;
break;
2013-09-13 12:22:58 -04:00
}
while ((size_t)at < map->len && (unsigned long)at < offset + hdr.toc_length_compressed + hdr.size + length) {
unsigned long avail_in;
void *next_in;
unsigned int bytes = MIN(map->len - at, map->pgsz);
bytes = MIN(length, bytes);
if (!(strm.next_in = next_in = (void *)fmap_need_off_once(map, at, bytes))) {
cli_dbgmsg("cli_scanxar: Can't read %u bytes @ %lu.\n", bytes, (long unsigned)at);
2013-09-13 12:22:58 -04:00
inflateEnd(&strm);
rc = CL_EREAD;
2013-09-13 12:22:58 -04:00
goto exit_tmpfile;
}
at += bytes;
strm.avail_in = avail_in = bytes;
do {
int inf, outsize = 0;
unsigned char buff[FILEBUFF];
strm.avail_out = sizeof(buff);
strm.next_out = buff;
inf = inflate(&strm, Z_SYNC_FLUSH);
if (inf != Z_OK && inf != Z_STREAM_END && inf != Z_BUF_ERROR) {
cli_dbgmsg("cli_scanxar: inflate error %i %s.\n", inf, strm.msg ? strm.msg : "");
rc = CL_EFORMAT;
extract_errors++;
break;
}
bytes = sizeof(buff) - strm.avail_out;
if (e_hash_ctx != NULL)
xar_hash_update(e_hash_ctx, buff, bytes, e_hash);
if (cli_writen(fd, buff, bytes) == (size_t)-1) {
cli_dbgmsg("cli_scanxar: cli_writen error file %s.\n", tmpname);
inflateEnd(&strm);
rc = CL_EWRITE;
goto exit_tmpfile;
}
outsize += sizeof(buff) - strm.avail_out;
if (cli_checklimits("cli_scanxar", ctx, outsize, 0, 0) != CL_CLEAN) {
break;
}
if (inf == Z_STREAM_END) {
break;
}
} while (strm.avail_out == 0);
if (rc != CL_SUCCESS)
2013-09-13 12:22:58 -04:00
break;
avail_in -= strm.avail_in;
if (a_hash_ctx != NULL)
xar_hash_update(a_hash_ctx, next_in, avail_in, a_hash);
}
inflateEnd(&strm);
break;
case CL_TYPE_7Z:
#define CLI_LZMA_OBUF_SIZE 1024 * 1024
#define CLI_LZMA_HDR_SIZE LZMA_PROPS_SIZE + 8
#define CLI_LZMA_IBUF_SIZE CLI_LZMA_OBUF_SIZE >> 2 /* estimated compression ratio 25% */
2013-10-08 17:17:44 -04:00
{
2013-10-11 12:04:29 -04:00
struct CLI_LZMA lz;
2016-06-21 17:22:52 -04:00
unsigned long in_remaining = MIN(length, map->len - at);
unsigned long out_size = 0;
unsigned char *buff = __lzma_wrap_alloc(NULL, CLI_LZMA_OBUF_SIZE);
int lret;
2016-06-21 17:22:52 -04:00
if (length > in_remaining)
length = in_remaining;
2013-10-11 12:04:29 -04:00
memset(&lz, 0, sizeof(lz));
if (buff == NULL) {
cli_dbgmsg("cli_scanxar: memory request for lzma decompression buffer fails.\n");
rc = CL_EMEM;
goto exit_tmpfile;
}
blockp = (void *)fmap_need_off_once(map, at, CLI_LZMA_HDR_SIZE);
if (blockp == NULL) {
char errbuff[128];
cli_strerror(errno, errbuff, sizeof(errbuff));
cli_dbgmsg("cli_scanxar: Can't read %i bytes @ %zu, errno:%s.\n",
2016-06-21 17:22:52 -04:00
CLI_LZMA_HDR_SIZE, at, errbuff);
rc = CL_EREAD;
__lzma_wrap_free(NULL, buff);
goto exit_tmpfile;
}
lz.next_in = blockp;
lz.avail_in = CLI_LZMA_HDR_SIZE;
if (a_hash_ctx != NULL)
xar_hash_update(a_hash_ctx, blockp, CLI_LZMA_HDR_SIZE, a_hash);
lret = cli_LzmaInit(&lz, 0);
if (lret != LZMA_RESULT_OK) {
cli_dbgmsg("cli_scanxar: cli_LzmaInit() fails: %i.\n", lret);
rc = CL_EFORMAT;
__lzma_wrap_free(NULL, buff);
extract_errors++;
break;
}
at += CLI_LZMA_HDR_SIZE;
in_remaining -= CLI_LZMA_HDR_SIZE;
while (at < map->len && at < offset + (size_t)hdr.toc_length_compressed + (size_t)hdr.size + length) {
SizeT avail_in;
SizeT avail_out;
void *next_in;
unsigned long in_consumed;
lz.next_out = buff;
lz.avail_out = CLI_LZMA_OBUF_SIZE;
2013-10-08 17:17:44 -04:00
lz.avail_in = avail_in = MIN(CLI_LZMA_IBUF_SIZE, in_remaining);
lz.next_in = next_in = (void *)fmap_need_off_once(map, at, lz.avail_in);
if (lz.next_in == NULL) {
char errbuff[128];
cli_strerror(errno, errbuff, sizeof(errbuff));
cli_dbgmsg("cli_scanxar: Can't read %zu bytes @ %zu, errno: %s.\n",
2016-06-21 17:22:52 -04:00
lz.avail_in, at, errbuff);
rc = CL_EREAD;
__lzma_wrap_free(NULL, buff);
cli_LzmaShutdown(&lz);
goto exit_tmpfile;
}
lret = cli_LzmaDecode(&lz);
if (lret != LZMA_RESULT_OK && lret != LZMA_STREAM_END) {
cli_dbgmsg("cli_scanxar: cli_LzmaDecode() fails: %i.\n", lret);
rc = CL_EFORMAT;
extract_errors++;
break;
}
in_consumed = avail_in - lz.avail_in;
in_remaining -= in_consumed;
at += in_consumed;
avail_out = CLI_LZMA_OBUF_SIZE - lz.avail_out;
if (avail_out == 0)
cli_dbgmsg("cli_scanxar: cli_LzmaDecode() produces no output for "
2016-02-22 13:26:15 -05:00
"avail_in %llu, avail_out %llu.\n",
(long long unsigned)avail_in, (long long unsigned)avail_out);
if (a_hash_ctx != NULL)
xar_hash_update(a_hash_ctx, next_in, in_consumed, a_hash);
if (e_hash_ctx != NULL)
xar_hash_update(e_hash_ctx, buff, avail_out, e_hash);
/* Write a decompressed block. */
/* cli_dbgmsg("Writing %li bytes to LZMA decompress temp file, " */
/* "consumed %li of %li available compressed bytes.\n", */
/* avail_out, in_consumed, avail_in); */
if (cli_writen(fd, buff, avail_out) == (size_t)-1) {
2016-02-22 13:26:15 -05:00
cli_dbgmsg("cli_scanxar: cli_writen error writing lzma temp file for %llu bytes.\n",
(long long unsigned)avail_out);
__lzma_wrap_free(NULL, buff);
cli_LzmaShutdown(&lz);
rc = CL_EWRITE;
goto exit_tmpfile;
}
/* Check file size limitation. */
out_size += avail_out;
if (cli_checklimits("cli_scanxar", ctx, out_size, 0, 0) != CL_CLEAN) {
break;
}
if (lret == LZMA_STREAM_END)
break;
}
cli_LzmaShutdown(&lz);
__lzma_wrap_free(NULL, buff);
} break;
case CL_TYPE_ANY:
default:
case CL_TYPE_BZ:
case CL_TYPE_XZ:
/* for uncompressed, bzip2, xz, and unknown, just pull the file, cli_magic_scan_desc does the rest */
do_extract_cksum = 0;
{
size_t writelen = MIN(map->len - at, length);
2016-06-21 17:22:52 -04:00
if (ctx->engine->maxfilesize)
writelen = MIN((size_t)(ctx->engine->maxfilesize), writelen);
if (!(blockp = (void *)fmap_need_off_once(map, at, writelen))) {
char errbuff[128];
cli_strerror(errno, errbuff, sizeof(errbuff));
cli_dbgmsg("cli_scanxar: Can't read %zu bytes @ %zu, errno:%s.\n",
writelen, at, errbuff);
rc = CL_EREAD;
goto exit_tmpfile;
}
if (a_hash_ctx != NULL)
xar_hash_update(a_hash_ctx, blockp, writelen, a_hash);
if (cli_writen(fd, blockp, writelen) == (size_t)-1) {
cli_dbgmsg("cli_scanxar: cli_writen error %zu bytes @ %zu.\n", writelen, at);
rc = CL_EWRITE;
goto exit_tmpfile;
}
/*break;*/
}
2016-06-21 17:22:52 -04:00
} /* end of switch */
if (a_hash_ctx != NULL) {
2020-08-27 20:42:43 -07:00
xar_hash_final(a_hash_ctx, a_hash_result, a_hash);
a_hash_ctx = NULL;
} else if (rc == CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: archived-checksum missing.\n");
cksum_fails++;
}
if (e_hash_ctx != NULL) {
2020-08-27 20:42:43 -07:00
xar_hash_final(e_hash_ctx, e_hash_result, e_hash);
e_hash_ctx = NULL;
} else if (rc == CL_SUCCESS) {
cli_dbgmsg("cli_scanxar: extracted-checksum(unarchived-checksum) missing.\n");
cksum_fails++;
}
if (rc == CL_SUCCESS) {
if (a_cksum != NULL) {
expected = cli_hex2str((char *)a_cksum);
2020-08-27 20:42:43 -07:00
if (xar_hash_check(a_hash, a_hash_result, expected) != 0) {
cli_dbgmsg("cli_scanxar: archived-checksum mismatch.\n");
cksum_fails++;
} else {
cli_dbgmsg("cli_scanxar: archived-checksum matched.\n");
}
free(expected);
2013-09-13 12:22:58 -04:00
}
if (e_cksum != NULL) {
if (do_extract_cksum) {
expected = cli_hex2str((char *)e_cksum);
2020-08-27 20:42:43 -07:00
if (xar_hash_check(e_hash, e_hash_result, expected) != 0) {
cli_dbgmsg("cli_scanxar: extracted-checksum mismatch.\n");
cksum_fails++;
} else {
cli_dbgmsg("cli_scanxar: extracted-checksum matched.\n");
}
free(expected);
}
}
rc = cli_magic_scan_desc(fd, tmpname, ctx, NULL); /// TODO: collect file names in xar_get_toc_data_values()
if (rc != CL_SUCCESS) {
if (rc == CL_VIRUS) {
cli_dbgmsg("cli_scanxar: Infected with %s\n", cli_get_last_virus(ctx));
if (!SCAN_ALLMATCHES)
goto exit_tmpfile;
} else if (rc != CL_BREAK) {
cli_dbgmsg("cli_scanxar: cli_magic_scan_desc error %i\n", rc);
2013-09-13 12:22:58 -04:00
goto exit_tmpfile;
}
2013-09-13 12:22:58 -04:00
}
}
if (a_cksum != NULL) {
xmlFree(a_cksum);
a_cksum = NULL;
}
if (e_cksum != NULL) {
xmlFree(e_cksum);
e_cksum = NULL;
}
}
2013-09-13 12:22:58 -04:00
exit_tmpfile:
2013-09-13 12:22:58 -04:00
xar_cleanup_temp_file(ctx, fd, tmpname);
2014-10-07 12:39:12 -04:00
if (a_hash_ctx != NULL)
2020-08-27 20:42:43 -07:00
xar_hash_final(a_hash_ctx, a_hash_result, a_hash);
2014-10-07 12:39:12 -04:00
if (e_hash_ctx != NULL)
2020-08-27 20:42:43 -07:00
xar_hash_final(e_hash_ctx, e_hash_result, e_hash);
exit_reader:
if (a_cksum != NULL)
xmlFree(a_cksum);
if (e_cksum != NULL)
xmlFree(e_cksum);
xmlTextReaderClose(reader);
2013-09-13 12:22:58 -04:00
xmlFreeTextReader(reader);
exit_toc:
2013-09-13 12:22:58 -04:00
free(toc);
if (rc == CL_BREAK)
rc = CL_SUCCESS;
#else
2013-09-13 12:39:04 -04:00
cli_dbgmsg("cli_scanxar: can't scan xar files, need libxml2.\n");
2013-09-13 12:22:58 -04:00
#endif
if (cksum_fails + extract_errors != 0) {
2016-06-21 17:22:52 -04:00
cli_dbgmsg("cli_scanxar: %u checksum errors and %u extraction errors.\n",
cksum_fails, extract_errors);
}
2013-09-13 12:22:58 -04:00
return rc;
}