clamav/libclamav/filetypes.c

479 lines
20 KiB
C
Raw Normal View History

/*
2020-01-03 15:44:07 -05:00
* Copyright (C) 2013-2020 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2007-2013 Sourcefire, Inc.
*
* Authors: Tomasz Kojm
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "clamav.h"
#include "filetypes.h"
#include "others.h"
#include "readdb.h"
#include "matcher-ac.h"
#include "str.h"
#include "textdet.h"
#include "default.h"
#include "iowrap.h"
#include "mbr.h"
#include "gpt.h"
2014-11-24 10:47:44 -05:00
#include "ooxml.h"
#include "htmlnorm.h"
#include "entconv.h"
#include "mpool.h"
#define UNZIP_PRIVATE
#include "unzip.h"
// clang-format off
static const struct ftmap_s {
const char *name;
cli_file_t code;
} ftmap[] = {
{ "CL_TYPE_TEXT_ASCII", CL_TYPE_TEXT_ASCII },
{ "CL_TYPE_TEXT_UTF8", CL_TYPE_TEXT_UTF8 },
{ "CL_TYPE_TEXT_UTF16LE", CL_TYPE_TEXT_UTF16LE },
{ "CL_TYPE_TEXT_UTF16BE", CL_TYPE_TEXT_UTF16BE },
{ "CL_TYPE_BINARY_DATA", CL_TYPE_BINARY_DATA },
{ "CL_TYPE_IGNORED", CL_TYPE_IGNORED },
{ "CL_TYPE_ANY", CL_TYPE_ANY },
{ "CL_TYPE_MSEXE", CL_TYPE_MSEXE },
{ "CL_TYPE_ELF", CL_TYPE_ELF },
{ "CL_TYPE_MACHO", CL_TYPE_MACHO },
{ "CL_TYPE_MACHO_UNIBIN", CL_TYPE_MACHO_UNIBIN },
{ "CL_TYPE_POSIX_TAR", CL_TYPE_POSIX_TAR },
{ "CL_TYPE_OLD_TAR", CL_TYPE_OLD_TAR },
{ "CL_TYPE_CPIO_OLD", CL_TYPE_CPIO_OLD },
{ "CL_TYPE_CPIO_ODC", CL_TYPE_CPIO_ODC },
{ "CL_TYPE_CPIO_NEWC", CL_TYPE_CPIO_NEWC },
{ "CL_TYPE_CPIO_CRC", CL_TYPE_CPIO_CRC },
{ "CL_TYPE_GZ", CL_TYPE_GZ },
{ "CL_TYPE_ZIP", CL_TYPE_ZIP },
{ "CL_TYPE_BZ", CL_TYPE_BZ },
{ "CL_TYPE_RAR", CL_TYPE_RAR },
{ "CL_TYPE_ARJ", CL_TYPE_ARJ },
{ "CL_TYPE_MSSZDD", CL_TYPE_MSSZDD },
{ "CL_TYPE_MSOLE2", CL_TYPE_MSOLE2 },
{ "CL_TYPE_MSCAB", CL_TYPE_MSCAB },
{ "CL_TYPE_MSCHM", CL_TYPE_MSCHM },
{ "CL_TYPE_SIS", CL_TYPE_SIS },
{ "CL_TYPE_SCRENC", CL_TYPE_SCRENC },
{ "CL_TYPE_GRAPHICS", CL_TYPE_GRAPHICS },
{ "CL_TYPE_GIF", CL_TYPE_GIF },
{ "CL_TYPE_PNG", CL_TYPE_PNG },
{ "CL_TYPE_RIFF", CL_TYPE_RIFF },
{ "CL_TYPE_BINHEX", CL_TYPE_BINHEX },
{ "CL_TYPE_TNEF", CL_TYPE_TNEF },
{ "CL_TYPE_CRYPTFF", CL_TYPE_CRYPTFF },
{ "CL_TYPE_PDF", CL_TYPE_PDF },
{ "CL_TYPE_UUENCODED", CL_TYPE_UUENCODED },
{ "CL_TYPE_HTML_UTF16", CL_TYPE_HTML_UTF16 },
{ "CL_TYPE_SCRIPT", CL_TYPE_SCRIPT },
{ "CL_TYPE_RTF", CL_TYPE_RTF },
{ "CL_TYPE_HTML", CL_TYPE_HTML },
{ "CL_TYPE_MAIL", CL_TYPE_MAIL },
{ "CL_TYPE_SFX", CL_TYPE_SFX },
{ "CL_TYPE_ZIPSFX", CL_TYPE_ZIPSFX },
{ "CL_TYPE_RARSFX", CL_TYPE_RARSFX },
{ "CL_TYPE_CABSFX", CL_TYPE_CABSFX },
{ "CL_TYPE_ARJSFX", CL_TYPE_ARJSFX },
{ "CL_TYPE_NULSFT", CL_TYPE_NULSFT },
{ "CL_TYPE_AUTOIT", CL_TYPE_AUTOIT },
{ "CL_TYPE_ISHIELD_MSI", CL_TYPE_ISHIELD_MSI },
{ "CL_TYPE_7Z", CL_TYPE_7Z },
{ "CL_TYPE_7ZSFX", CL_TYPE_7ZSFX },
{ "CL_TYPE_SWF", CL_TYPE_SWF },
{ "CL_TYPE_ISO9660", CL_TYPE_ISO9660 },
{ "CL_TYPE_JAVA", CL_TYPE_JAVA },
{ "CL_TYPE_DMG", CL_TYPE_DMG },
{ "CL_TYPE_MBR", CL_TYPE_MBR },
{ "CL_TYPE_GPT", CL_TYPE_GPT },
{ "CL_TYPE_APM", CL_TYPE_APM },
{ "CL_TYPE_XAR", CL_TYPE_XAR },
{ "CL_TYPE_PART_ANY", CL_TYPE_PART_ANY },
{ "CL_TYPE_PART_HFSPLUS", CL_TYPE_PART_HFSPLUS },
{ "CL_TYPE_XZ", CL_TYPE_XZ },
{ "CL_TYPE_OOXML_WORD", CL_TYPE_OOXML_WORD },
{ "CL_TYPE_OOXML_PPT", CL_TYPE_OOXML_PPT },
{ "CL_TYPE_OOXML_XL", CL_TYPE_OOXML_XL },
{ "CL_TYPE_INTERNAL", CL_TYPE_INTERNAL },
{ "CL_TYPE_XDP", CL_TYPE_XDP },
{ "CL_TYPE_XML_WORD", CL_TYPE_XML_WORD },
{ "CL_TYPE_XML_XL", CL_TYPE_XML_XL },
{ "CL_TYPE_HWP3", CL_TYPE_HWP3 },
{ "CL_TYPE_XML_HWP", CL_TYPE_XML_HWP },
{ "CL_TYPE_HWPOLE2", CL_TYPE_HWPOLE2 },
{ "CL_TYPE_OOXML_HWP", CL_TYPE_OOXML_HWP },
{ "CL_TYPE_PS", CL_TYPE_PS },
{ "CL_TYPE_MHTML", CL_TYPE_MHTML },
{ "CL_TYPE_LNK", CL_TYPE_LNK },
{ "CL_TYPE_EGG", CL_TYPE_EGG },
{ "CL_TYPE_EGGSFX", CL_TYPE_EGGSFX },
{ NULL, CL_TYPE_IGNORED }
};
// clang-format on
cli_file_t cli_ftcode(const char *name)
{
unsigned int i;
for (i = 0; ftmap[i].name; i++)
if (!strcmp(ftmap[i].name, name))
return ftmap[i].code;
return CL_TYPE_ERROR;
}
/*
 * Translate a file type code into its textual name.
 *
 * code:    type code to look up.
 * returns: the name stored next to the first matching code in ftmap, or
 *          NULL when no row carries that code.
 */
const char *cli_ftname(cli_file_t code)
{
    const struct ftmap_s *entry = ftmap;

    /* Walk the rows until the NULL-named terminator. */
    while (entry->name != NULL) {
        if (entry->code == code)
            return entry->name;
        entry++;
    }

    return NULL;
}
/*
 * Release every node of the engine's file type list (engine->ftypes) and
 * partition type list (engine->ptypes).
 *
 * For each node the magic bytes, the type name and the node itself are
 * handed to MPOOL_FREE.  The list head pointers inside the engine are not
 * modified by this function.
 */
void cli_ftfree(const struct cl_engine *engine)
{
    struct cli_ftype *lists[2];
    struct cli_ftype *node, *next;
    size_t i;

    lists[0] = engine->ftypes;
    lists[1] = engine->ptypes;

    for (i = 0; i < sizeof(lists) / sizeof(lists[0]); i++) {
        for (node = lists[i]; node != NULL; node = next) {
            /* Read the link before the node is released. */
            next = node->next;
            MPOOL_FREE(engine->mempool, node->magic);
            MPOOL_FREE(engine->mempool, node->tname);
            MPOOL_FREE(engine->mempool, node);
        }
    }
}
cli_file_t cli_compare_ftm_partition(const unsigned char *buf, size_t buflen, const struct cl_engine *engine)
2013-09-17 16:45:48 -04:00
{
struct cli_ftype *ptype = engine->ptypes;
while (ptype) {
if (ptype->offset + ptype->length <= buflen) {
if (!memcmp(buf + ptype->offset, ptype->magic, ptype->length)) {
cli_dbgmsg("Recognized %s partition\n", ptype->tname);
return ptype->type;
}
}
ptype = ptype->next;
2013-09-17 16:45:48 -04:00
}
cli_dbgmsg("Partition type is potentially unsupported\n");
2013-09-17 16:45:48 -04:00
return CL_TYPE_PART_ANY;
}
cli_file_t cli_compare_ftm_file(const unsigned char *buf, size_t buflen, const struct cl_engine *engine)
{
struct cli_ftype *ftype = engine->ftypes;
while (ftype) {
if (ftype->offset + ftype->length <= buflen) {
if (!memcmp(buf + ftype->offset, ftype->magic, ftype->length)) {
cli_dbgmsg("Recognized %s file\n", ftype->tname);
return ftype->type;
}
}
ftype = ftype->next;
}
return cli_texttype(buf, buflen);
}
2012-01-05 14:16:09 +02:00
/*
 * Tar header probe; its definition is not in this part of the file.
 * cli_determine_fmap_type() treats a return of 1 as an old-style tar and
 * 2 as a POSIX tar, and ignores every other value.
 */
int is_tar(const unsigned char *buf, unsigned int nbytes);
/* organize by length, cannot exceed SIZEOF_LOCAL_HEADER */
// clang-format off
/* One zip entry name prefix and the file type it points to. */
struct ooxml_ftcodes {
    const char *entry; /* prefix compared against the zip entry name */
    size_t len;        /* number of bytes of entry to compare */
    cli_file_t type;   /* CL_TYPE_ZIP marks a hint only, not a final type */
};

/*
 * Entry name prefixes checked by cli_determine_fmap_type() right after a
 * zip local header.  Rows typed CL_TYPE_ZIP only flag the archive as
 * "likely OOXML"; any other type is returned as the detection result.
 * The row whose entry is NULL terminates the table.
 *
 * NOTE(review): OPC packages normally keep their relationships part at
 * "_rels/.rels"; confirm that the "_.rels/.rels" spelling is intended.
 */
const struct ooxml_ftcodes ooxml_detect[] = {
    {"xl/",                     3, CL_TYPE_OOXML_XL},
    {"ppt/",                    4, CL_TYPE_OOXML_PPT},
    {"word/",                   5, CL_TYPE_OOXML_WORD},
    {"BinData",                 7, CL_TYPE_ZIP},       /* HWP */
    {"mimetype",                8, CL_TYPE_ZIP},       /* HWP */
    {"Contents",                8, CL_TYPE_ZIP},       /* HWP */
    {"docProps/",               9, CL_TYPE_ZIP},       /* MS */
    {"customXml/",             10, CL_TYPE_ZIP},       /* MS */
    {"version.xml",            11, CL_TYPE_ZIP},       /* HWP */
    {"settings.xml",           12, CL_TYPE_ZIP},       /* HWP */
    {"_.rels/.rels",           12, CL_TYPE_ZIP},       /* MS */
    {"[ContentTypes].xml",     18, CL_TYPE_ZIP},       /* MS */
    {"[Content_Types].xml",    19, CL_TYPE_ZIP},       /* MS */
    {"Preview/PrvText.txt",    19, CL_TYPE_ZIP},       /* HWP */
    {"Contents/content.hpf",   20, CL_TYPE_OOXML_HWP},
    {"META-INF/container.xml", 22, CL_TYPE_ZIP},       /* HWP */
    {NULL,                      0, CL_TYPE_ANY}
};
// clang-format on
2016-01-13 17:16:44 -05:00
/* Length of the longest entry name in ooxml_detect; keep the two in sync. */
#define OOXML_DETECT_MAXLEN 22

/*
 * Act on an OOXML typing result inside a function returning cli_file_t.
 *
 * For the four OOXML types the macro logs the detection and RETURNS that
 * type from the enclosing function.  CL_TYPE_ZIP is ignored silently; any
 * other value is logged as unexpected and execution continues after the
 * macro.  The argument is evaluated exactly once.
 */
#define OOXML_FTIDENTIFIED(type)                                                         \
    do {                                                                                 \
        const cli_file_t ooxml_ft_ = (type);                                             \
        if (ooxml_ft_ != CL_TYPE_ZIP) {                                                  \
            switch (ooxml_ft_) {                                                         \
                case CL_TYPE_OOXML_XL:                                                   \
                    cli_dbgmsg("Recognized OOXML XL file\n");                            \
                    return CL_TYPE_OOXML_XL;                                             \
                case CL_TYPE_OOXML_PPT:                                                  \
                    cli_dbgmsg("Recognized OOXML PPT file\n");                           \
                    return CL_TYPE_OOXML_PPT;                                            \
                case CL_TYPE_OOXML_WORD:                                                 \
                    cli_dbgmsg("Recognized OOXML WORD file\n");                          \
                    return CL_TYPE_OOXML_WORD;                                           \
                case CL_TYPE_OOXML_HWP:                                                  \
                    cli_dbgmsg("Recognized OOXML HWP file\n");                           \
                    return CL_TYPE_OOXML_HWP;                                            \
                default:                                                                 \
                    cli_dbgmsg("unexpected ooxml_filetype return: %i\n", ooxml_ft_);     \
                    break;                                                               \
            }                                                                            \
        }                                                                                \
    } while (0)
/*
 * Work out the type of the data behind an fmap.
 *
 * map:      data to type.
 * engine:   engine providing the magic lists and the file type matcher.
 * basetype: CL_TYPE_PART_ANY to type a partition; anything else types a
 *           file.
 *
 * Steps, all working on at most MAGIC_BUFFER_SIZE bytes from offset 0:
 *  1. Compare against the partition or file magic list.
 *  2. Files only: probe BINARY_DATA for tar, probe ZIP for OOXML entry
 *     names (mapping further into the file when needed), and validate
 *     MBR hits with cli_mbr_check().
 *  3. If the result lies between CL_TYPE_TEXT_ASCII and
 *     CL_TYPE_BINARY_DATA, run the file type signatures of
 *     engine->root[0] over the raw bytes, then over a UTF-16 -> ASCII
 *     decoding, then (when PHISHING_CONF_ENTCONV is enabled) over a
 *     BOM-detected, ASCII-normalized copy.
 *
 * returns: the detected type, or CL_TYPE_ERROR when engine or map is
 *          NULL or the data cannot be mapped or read.
 */
cli_file_t cli_determine_fmap_type(fmap_t *map, const struct cl_engine *engine, cli_file_t basetype)
{
    unsigned char buffer[MAGIC_BUFFER_SIZE];
    const unsigned char *buff;
    unsigned char *decoded;
    int bread, sret;
    cli_file_t ret = CL_TYPE_BINARY_DATA;
    struct cli_matcher *root;
    struct cli_ac_data mdata;

    if (!engine) {
        cli_errmsg("cli_determine_fmap_type: engine == NULL\n");
        return CL_TYPE_ERROR;
    }

    if (!map) {
        cli_errmsg("cli_determine_fmap_type: map == NULL\n");
        return CL_TYPE_ERROR;
    }

    if (basetype == CL_TYPE_PART_ANY) {
        bread = MIN(map->len, CL_PART_MBUFF_SIZE);
    } else {
        bread = MIN(map->len, CL_FILE_MBUFF_SIZE);
    }

    if (bread > MAGIC_BUFFER_SIZE) {
        /* Save anyone who tampered with the header */
        bread = MAGIC_BUFFER_SIZE;
    }

    buff = fmap_need_off_once(map, 0, bread);
    if (!buff)
        return CL_TYPE_ERROR;

    /* Only the status of the copy is used; the copy in `buffer` is not
     * read again in this function. */
    if (cli_memcpy(buffer, buff, bread)) {
        cli_errmsg("cli_determine_fmap_type: fileread error!\n");
        return CL_TYPE_ERROR;
    }

    if (basetype == CL_TYPE_PART_ANY) { /* typing a partition */
        ret = cli_compare_ftm_partition(buff, bread, engine);
    } else { /* typing a file */
        ret = cli_compare_ftm_file(buff, bread, engine);

        if (ret == CL_TYPE_BINARY_DATA) {
            switch (is_tar(buff, bread)) {
                case 1:
                    cli_dbgmsg("Recognized old fashioned tar file\n");
                    return CL_TYPE_OLD_TAR;
                case 2:
                    cli_dbgmsg("Recognized POSIX tar file\n");
                    return CL_TYPE_POSIX_TAR;
            }
        } else if (ret == CL_TYPE_ZIP && bread > 2 * (SIZEOF_LOCAL_HEADER + 5)) {
            const char lhdr_magic[4] = {0x50, 0x4b, 0x03, 0x04};
            const unsigned char *zbuff = buff; /* start of the window being searched */
            uint32_t zread = bread;            /* size of that window */
            uint64_t zoff = bread;             /* file offset just past the window */
            const unsigned char *znamep = buff; /* search cursor / entry name */
            int32_t zlen = bread;               /* bytes left from znamep */
            int lhc = 0;                        /* local headers examined */
            int zi, i, likely_ooxml = 0;
            cli_file_t ret2;

            /* At most 32 rounds of "find a header" / "map the next window". */
            for (zi = 0; zi < 32; zi++) {
                znamep = (const unsigned char *)cli_memstr((const char *)znamep, zlen, lhdr_magic, 4);
                if (NULL != znamep) {
                    /* Step over the local header to reach the entry name. */
                    znamep += SIZEOF_LOCAL_HEADER;
                    zlen = zread - (znamep - zbuff);

                    if (zlen > OOXML_DETECT_MAXLEN) {
                        for (i = 0; ooxml_detect[i].entry; i++) {
                            if (0 == memcmp(znamep, ooxml_detect[i].entry, ooxml_detect[i].len)) {
                                if (ooxml_detect[i].type != CL_TYPE_ZIP) {
                                    OOXML_FTIDENTIFIED(ooxml_detect[i].type);
                                    /* returns any unexpected type detection */
                                    return ooxml_detect[i].type;
                                }

                                likely_ooxml = 1;
                            }
                        }

                        /* only check first three readable zip headers */
                        if (++lhc > 2) {
                            /* if likely, check full archive */
                            if (likely_ooxml) {
                                cli_dbgmsg("Likely OOXML, checking additional zip headers\n");
                                if ((ret2 = cli_ooxml_filetype(NULL, map)) != CL_TYPE_ANY) {
                                    /* either an error or retyping has occurred, return error or just CL_TYPE_ZIP? */
                                    OOXML_FTIDENTIFIED(ret2);
                                    /* falls-through to additional filetyping */
                                }
                            }
                            break;
                        }
                    } else {
                        znamep = NULL; /* force to map more */
                    }
                }

                if (znamep == NULL) {
                    if (map->len - zoff > SIZEOF_LOCAL_HEADER) {
                        zoff -= SIZEOF_LOCAL_HEADER + OOXML_DETECT_MAXLEN + 1; /* remap for SIZEOF_LOCAL_HEADER+filelen for header overlap map boundary */
                        zread = MIN(MAGIC_BUFFER_SIZE, map->len - zoff);
                        zbuff = fmap_need_off_once(map, zoff, zread);
                        if (zbuff == NULL) {
                            cli_dbgmsg("cli_determine_fmap_type: error mapping data for OOXML check\n");
                            return CL_TYPE_ERROR;
                        }
                        zoff += zread;
                        znamep = zbuff;
                        zlen = zread;
                    } else {
                        break; /* end of data */
                    }
                }
            }
        } else if (ret == CL_TYPE_MBR) {
            /* given filetype sig type 0 */
            int iret = cli_mbr_check(buff, bread, map->len);

            if (iret == CL_TYPE_GPT) {
                cli_dbgmsg("Recognized GUID Partition Table file\n");
                return CL_TYPE_GPT;
            } else if (iret == CL_CLEAN) {
                return CL_TYPE_MBR;
            }

            /* re-detect type */
            cli_dbgmsg("Recognized binary data\n");
            ret = CL_TYPE_BINARY_DATA;
        }
    }

    if (ret >= CL_TYPE_TEXT_ASCII && ret <= CL_TYPE_BINARY_DATA) {
        /* HTML files may contain special characters and could be
         * misidentified as BINARY_DATA by cli_compare_ftm_file()
         */
        root = engine->root[0];
        if (!root)
            return ret;

        if (cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
            return ret;

        sret = cli_ac_scanbuff(buff, bread, NULL, NULL, NULL, engine->root[0], &mdata, 0, ret, NULL, AC_SCAN_FT, NULL);

        cli_ac_freedata(&mdata);

        if (sret >= CL_TYPENO) {
            ret = sret;
        } else {
            /* Second attempt: treat the data as UTF-16 and scan its ASCII decoding. */
            if (cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
                return ret;

            decoded = (unsigned char *)cli_utf16toascii((char *)buff, bread);
            if (decoded) {
                sret = cli_ac_scanbuff(decoded, bread / 2, NULL, NULL, NULL, engine->root[0], &mdata, 0, CL_TYPE_TEXT_ASCII, NULL, AC_SCAN_FT, NULL);
                free(decoded);
                if (sret == CL_TYPE_HTML)
                    ret = CL_TYPE_HTML_UTF16;
            }

            cli_ac_freedata(&mdata);

            if ((((struct cli_dconf *)engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
                const char *encoding;

                /* check if we can autodetect this encoding.
                 * If we can't don't try to detect HTML sig, since
                 * we just tried that above, and failed */
                if ((encoding = encoding_detect_bom(buff, bread))) {
                    unsigned char decodedbuff[(MAGIC_BUFFER_SIZE + 1) * 2];
                    m_area_t in_area, out_area;

                    memset(decodedbuff, 0, sizeof(decodedbuff));

                    in_area.buffer = (unsigned char *)buff;
                    in_area.length = bread;
                    in_area.offset = 0;
                    out_area.buffer = decodedbuff;
                    out_area.length = sizeof(decodedbuff);
                    out_area.offset = 0;

                    /* in htmlnorm we simply skip over \0 chars, allowing HTML parsing in any unicode
                     * (multibyte characters will not be exactly handled, but that is not a problem).
                     * However when detecting whether a file is HTML or not, we need exact conversion.
                     * (just eliminating zeros and matching would introduce false positives */
                    if (encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {
                        if (cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
                            return ret;

                        /* out_area.length > 0 is already guaranteed by the condition above. */
                        sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, NULL, NULL, engine->root[0], &mdata, 0, 0, NULL, AC_SCAN_FT, NULL); /* FIXME: can we use CL_TYPE_TEXT_ASCII instead of 0? */
                        if (sret == CL_TYPE_HTML) {
                            cli_dbgmsg("cli_determine_fmap_type: detected HTML signature in Unicode file\n");
                            /* htmlnorm is able to handle any unicode now, since it skips null chars */
                            ret = CL_TYPE_HTML;
                        }

                        cli_ac_freedata(&mdata);
                    }
                }
            }
        }
    }

    return ret;
}