2009-08-19 20:42:40 +02:00
|
|
|
|
/*
|
2025-02-14 10:24:30 -05:00
|
|
|
|
* Copyright (C) 2013-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
|
2013-11-26 13:51:54 -05:00
|
|
|
|
* Copyright (C) 2009-2013 Sourcefire, Inc.
|
2009-08-19 20:42:40 +02:00
|
|
|
|
*
|
|
|
|
|
* Authors: aCaB <acab@clamav.net>
|
|
|
|
|
*
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
|
|
|
* MA 02110-1301, USA.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* an mmap "replacement" which doesn't suck */
|
|
|
|
|
|
|
|
|
|
#if HAVE_CONFIG_H
|
|
|
|
|
#include "clamav-config.h"
|
|
|
|
|
#endif
|
|
|
|
|
|
2018-07-30 20:19:28 -04:00
|
|
|
|
#include <stdint.h>
|
2009-08-19 20:42:40 +02:00
|
|
|
|
#include <sys/types.h>
|
|
|
|
|
#include <sys/stat.h>
|
2009-08-20 01:34:03 +02:00
|
|
|
|
#include <string.h>
|
2018-07-30 20:19:28 -04:00
|
|
|
|
#include <libgen.h>
|
2009-10-12 23:32:27 +02:00
|
|
|
|
#ifdef HAVE_UNISTD_H
|
2009-08-19 20:42:40 +02:00
|
|
|
|
#include <unistd.h>
|
2009-10-12 23:32:27 +02:00
|
|
|
|
#endif
|
2010-05-15 22:32:44 +02:00
|
|
|
|
#ifdef ANONYMOUS_MAP
|
2009-09-07 11:15:18 +02:00
|
|
|
|
#ifdef HAVE_SYS_MMAN_H
|
2009-08-20 02:19:57 +02:00
|
|
|
|
#include <sys/mman.h>
|
2009-09-07 11:15:18 +02:00
|
|
|
|
#endif
|
|
|
|
|
#endif
|
2010-05-04 19:38:43 +02:00
|
|
|
|
#include <errno.h>
|
2009-08-19 20:42:40 +02:00
|
|
|
|
|
2009-10-10 15:39:31 +02:00
|
|
|
|
#ifdef C_LINUX
|
2009-09-10 03:20:02 +02:00
|
|
|
|
#include <pthread.h>
|
2009-10-10 15:39:31 +02:00
|
|
|
|
#endif
|
2009-09-10 03:20:02 +02:00
|
|
|
|
|
2014-07-01 19:38:01 -04:00
|
|
|
|
#include "clamav.h"
|
2009-08-19 20:42:40 +02:00
|
|
|
|
#include "others.h"
|
2018-07-30 20:19:28 -04:00
|
|
|
|
#include "str.h"
|
2009-08-20 01:07:33 +02:00
|
|
|
|
|
2020-02-28 18:29:35 -05:00
|
|
|
|
#define FM_MASK_COUNT 0x3fffffff
|
|
|
|
|
#define FM_MASK_PAGED 0x40000000
|
|
|
|
|
#define FM_MASK_SEEN 0x80000000
|
|
|
|
|
#define FM_MASK_LOCKED FM_MASK_SEEN
|
|
|
|
|
/* 2 high bits:
|
|
|
|
|
00 - not seen - not paged - N/A
|
|
|
|
|
01 - N/A - paged - not locked
|
|
|
|
|
10 - seen - not paged - N/A
|
|
|
|
|
11 - N/A - paged - locked
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* FIXME: tune this stuff */
|
|
|
|
|
#define UNPAGE_THRSHLD_LO 4 * 1024 * 1024
|
|
|
|
|
#define UNPAGE_THRSHLD_HI 8 * 1024 * 1024
|
|
|
|
|
#define READAHEAD_PAGES 8
|
|
|
|
|
|
|
|
|
|
#if defined(ANONYMOUS_MAP) && defined(C_LINUX) && defined(CL_THREAD_SAFE)
|
|
|
|
|
/*
|
|
|
|
|
WORKAROUND
|
|
|
|
|
Relieve some stress on mmap_sem.
|
|
|
|
|
When mmap_sem is heavily hammered, the scheduler
|
|
|
|
|
tends to fail to wake us up properly.
|
|
|
|
|
*/
|
|
|
|
|
pthread_mutex_t fmap_mutex = PTHREAD_MUTEX_INITIALIZER;
|
|
|
|
|
#define fmap_lock pthread_mutex_lock(&fmap_mutex)
|
|
|
|
|
#define fmap_unlock pthread_mutex_unlock(&fmap_mutex);
|
|
|
|
|
#else
|
|
|
|
|
#define fmap_lock
|
|
|
|
|
#define fmap_unlock
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#ifndef MADV_DONTFORK
|
|
|
|
|
#define MADV_DONTFORK 0
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#define fmap_bitmap (m->bitmap)
|
|
|
|
|
|
2021-03-31 12:16:41 -07:00
|
|
|
|
static inline uint64_t fmap_align_items(uint64_t sz, uint64_t al);
|
|
|
|
|
static inline uint64_t fmap_align_to(uint64_t sz, uint64_t al);
|
|
|
|
|
static inline uint64_t fmap_which_page(fmap_t *m, size_t at);
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2020-02-28 18:29:35 -05:00
|
|
|
|
static const void *handle_need(fmap_t *m, size_t at, size_t len, int lock);
|
|
|
|
|
static void handle_unneed_off(fmap_t *m, size_t at, size_t len);
|
|
|
|
|
static const void *handle_need_offstr(fmap_t *m, size_t at, size_t len_hint);
|
|
|
|
|
static const void *handle_gets(fmap_t *m, char *dst, size_t *at, size_t max_len);
|
|
|
|
|
|
|
|
|
|
static void unmap_mmap(fmap_t *m);
|
|
|
|
|
static void unmap_malloc(fmap_t *m);
|
|
|
|
|
|
2009-10-10 20:46:05 +02:00
|
|
|
|
#ifndef _WIN32
|
2012-01-10 23:07:29 +02:00
|
|
|
|
/* pread proto here in order to avoid the use of XOPEN and BSD_SOURCE
|
|
|
|
|
which may in turn prevent some mmap constants to be defined */
|
|
|
|
|
ssize_t pread(int fd, void *buf, size_t count, off_t offset);
|
|
|
|
|
|
2009-10-10 19:10:15 +02:00
|
|
|
|
/* vvvvv POSIX STUFF BELOW vvvvv */
|
2012-01-09 12:43:21 +02:00
|
|
|
|
/* Read callback used for handle-backed fmaps on POSIX systems.
 * The opaque `handle` is really a file descriptor that was smuggled
 * through a void pointer by fmap_check_empty() below. */
static off_t pread_cb(void *handle, void *buf, size_t count, off_t offset)
{
    int fd = (int)(ssize_t)handle;

    return pread(fd, buf, count, offset);
}
|
|
|
|
|
|
2025-06-08 01:12:33 -04:00
|
|
|
|
/* Create an fmap over [offset, offset+len) of the file behind `fd`.
 * If len == 0, the mapping extends from `offset` to the end of the file.
 * On a zero-length mapping, *empty is set to 1 and NULL is returned so the
 * caller can distinguish "empty file" from a hard failure.
 * `name` and `path` (both optional, may be NULL) are duplicated into the map.
 * Returns the new map, or NULL on error (with *empty updated as above). */
fmap_t *fmap_check_empty(int fd, off_t offset, size_t len, int *empty, const char *name, const char *path)
{
    STATBUF st;
    fmap_t *m = NULL;

    *empty = 0;
    if (FSTAT(fd, &st)) {
        cli_warnmsg("fmap: fstat failed\n");
        return NULL;
    }

    /* len == 0 means "map to end of file"; the computed length is
     * range-checked against st.st_size just below. */
    if (!len) len = st.st_size - offset; /* bound checked later */
    if (!len) {
        cli_dbgmsg("fmap: attempted void mapping\n");
        *empty = 1;
        return NULL;
    }
    /* Reject mappings that would extend past the end of the file. */
    if (!CLI_ISCONTAINED_0_TO(st.st_size, offset, len)) {
        cli_warnmsg("fmap: attempted oof mapping\n");
        return NULL;
    }
    /* The fd is smuggled through the opaque handle pointer; pread_cb
     * casts it back. The final argument enables use of the aging bitmap. */
    m = cl_fmap_open_handle((void *)(ssize_t)fd, offset, len, pread_cb, 1);
    if (!m)
        return NULL;
    /* Record the file's mtime so cache validity can be checked later. */
    m->mtime = (uint64_t)st.st_mtime;

    if (NULL != name) {
        m->name = cli_safer_strdup(name);
        if (NULL == m->name) {
            fmap_free(m);
            return NULL;
        }
    }

    if (NULL != path) {
        m->path = cli_safer_strdup(path);
        if (NULL == m->path) {
            fmap_free(m);
            return NULL;
        }
    }

    return m;
}
|
|
|
|
|
#else
|
|
|
|
|
/* vvvvv WIN32 STUFF BELOW vvvvv */
|
2018-12-03 12:40:13 -05:00
|
|
|
|
/* Destructor for Win32 memory-mapped fmaps: unmaps the view, closes the
 * file-mapping handle, and frees the owned name/path strings and the map
 * struct itself. Safe to call with m == NULL.
 * NOTE(review): m->windows_file_handle is intentionally not closed here —
 * presumably it is owned by the CRT file descriptor; confirm with callers. */
static void unmap_win32(fmap_t *m)
{
    if (NULL != m) {
        if (NULL != m->data) {
            UnmapViewOfFile(m->data);
        }
        if (NULL != m->windows_map_handle) {
            CloseHandle(m->windows_map_handle);
        }
        if (NULL != m->name) {
            free(m->name);
        }
        if (NULL != m->path) {
            free(m->path);
        }
        free((void *)m);
    }
}
|
|
|
|
|
|
2025-06-08 01:12:33 -04:00
|
|
|
|
/* WIN32 variant: create an fmap over [offset, offset+len) of the file
 * behind `fd` using a real file mapping (CreateFileMapping/MapViewOfFile).
 * If len == 0, the mapping extends from `offset` to the end of the file.
 * On a zero-length mapping, *empty is set to 1 and NULL is returned so the
 * caller can distinguish "empty file" from a hard failure.
 * `name` and `path` (both optional, may be NULL) are duplicated into the map.
 * Returns the new map, or NULL on error (with *empty updated as above). */
fmap_t *fmap_check_empty(int fd, off_t offset, size_t len, int *empty, const char *name, const char *path)
{ /* WIN32 */
    int pgsz = cli_getpagesize();
    STATBUF st;
    fmap_t *m = NULL;
    const void *data;
    HANDLE windows_file_handle;
    HANDLE windows_map_handle;

    *empty = 0;
    if (FSTAT(fd, &st)) {
        cli_warnmsg("fmap: fstat failed\n");
        return NULL;
    }
    /* MapViewOfFile requires the view offset to be aligned to the system
     * allocation granularity; reject unaligned requests up front. */
    if (offset < 0 || offset != fmap_align_to(offset, pgsz)) {
        cli_warnmsg("fmap: attempted mapping with unaligned offset\n");
        return NULL;
    }
    /* len == 0 means "map to end of file"; the computed length is
     * range-checked against st.st_size just below. */
    if (!len) len = st.st_size - offset; /* bound checked later */
    if (!len) {
        cli_dbgmsg("fmap: attempted void mapping\n");
        *empty = 1;
        return NULL;
    }
    /* Reject mappings that would extend past the end of the file. */
    if (!CLI_ISCONTAINED_0_TO(st.st_size, offset, len)) {
        cli_warnmsg("fmap: attempted oof mapping\n");
        return NULL;
    }

    if ((windows_file_handle = (HANDLE)_get_osfhandle(fd)) == INVALID_HANDLE_VALUE) {
        cli_errmsg("fmap: cannot get a valid handle for descriptor %d\n", fd);
        return NULL;
    }
    /* The 64-bit length is split into high/low DWORDs; `(x >> 31) >> 1`
     * avoids undefined behavior when DWORD math would shift by 32. */
    if (!(windows_map_handle = CreateFileMapping(windows_file_handle, NULL, PAGE_READONLY, (DWORD)((len >> 31) >> 1), (DWORD)len, NULL))) {
        cli_errmsg("fmap: cannot create a map of descriptor %d\n", fd);
        return NULL;
    }
    if (!(data = MapViewOfFile(windows_map_handle, FILE_MAP_READ, (DWORD)((offset >> 31) >> 1), (DWORD)(offset), len))) {
        cli_errmsg("fmap: cannot map file descriptor %d\n", fd);
        CloseHandle(windows_map_handle);
        return NULL;
    }
    if (!(m = cl_fmap_open_memory(data, len))) {
        /* Fixed: the format string previously lacked a %d conversion for
         * the trailing `fd` argument, so the descriptor was never printed. */
        cli_errmsg("fmap: cannot allocate fmap_t for descriptor %d\n", fd);
        UnmapViewOfFile(data);
        CloseHandle(windows_map_handle);
        return NULL;
    }
    m->handle       = (void *)(size_t)fd;
    m->handle_is_fd = true;
    /* Keep the Win32 handles so unmap_win32() can tear the mapping down.
     * The file handle itself is owned by the CRT descriptor. */
    m->windows_file_handle = (void *)windows_file_handle;
    m->windows_map_handle  = (void *)windows_map_handle;
    m->unmap               = unmap_win32;
    /* Record the file's mtime, matching the POSIX variant above. */
    m->mtime = (uint64_t)st.st_mtime;

    if (NULL != name) {
        m->name = cli_safer_strdup(name);
        if (NULL == m->name) {
            fmap_free(m);
            return NULL;
        }
    }

    if (NULL != path) {
        m->path = cli_safer_strdup(path);
        if (NULL == m->path) {
            fmap_free(m);
            return NULL;
        }
    }

    return m;
}
|
|
|
|
|
#endif /* _WIN32 */
|
|
|
|
|
|
|
|
|
|
/* vvvvv SHARED STUFF BELOW vvvvv */
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2021-01-23 16:41:41 -08:00
|
|
|
|
fmap_t *fmap_duplicate(cl_fmap_t *map, size_t offset, size_t length, const char *name)
|
2020-02-28 18:29:35 -05:00
|
|
|
|
{
|
|
|
|
|
cl_error_t status = CL_ERROR;
|
|
|
|
|
cl_fmap_t *duplicate_map = NULL;
|
2021-01-23 16:41:41 -08:00
|
|
|
|
|
|
|
|
|
if (NULL == map) {
|
|
|
|
|
cli_warnmsg("fmap_duplicate: map is NULL!\n");
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
2020-02-28 18:29:35 -05:00
|
|
|
|
|
2022-05-08 14:59:09 -07:00
|
|
|
|
duplicate_map = malloc(sizeof(cl_fmap_t));
|
2020-02-28 18:29:35 -05:00
|
|
|
|
if (!duplicate_map) {
|
|
|
|
|
cli_warnmsg("fmap_duplicate: map allocation failed\n");
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
2009-08-23 20:06:39 +02:00
|
|
|
|
|
2020-02-28 18:29:35 -05:00
|
|
|
|
/* Duplicate the state of the original map */
|
|
|
|
|
memcpy(duplicate_map, map, sizeof(cl_fmap_t));
|
2025-09-26 18:26:00 -04:00
|
|
|
|
/* Clear the pointers that need to be unique pointers. */
|
|
|
|
|
duplicate_map->name = NULL;
|
|
|
|
|
duplicate_map->path = NULL;
|
2020-02-28 18:29:35 -05:00
|
|
|
|
|
2021-01-23 16:41:41 -08:00
|
|
|
|
if (offset > map->len) {
|
|
|
|
|
/* invalid offset, exceeds length of map */
|
|
|
|
|
cli_warnmsg("fmap_duplicate: requested offset exceeds end of map\n");
|
|
|
|
|
goto done;
|
2020-02-28 18:29:35 -05:00
|
|
|
|
}
|
2009-09-10 03:20:02 +02:00
|
|
|
|
|
2021-01-23 16:41:41 -08:00
|
|
|
|
if (offset > 0 || length < map->len) {
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
/*
|
|
|
|
|
* Caller requested a window into the current map, not the whole map.
|
|
|
|
|
*/
|
2021-01-23 16:41:41 -08:00
|
|
|
|
|
|
|
|
|
/* Set the new nested offset and (nested) length for the new map */
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
/* Note: We can't change offset because then we'd have to discard/move cached
|
2021-01-23 16:41:41 -08:00
|
|
|
|
* data, instead use nested_offset to reuse the already cached data */
|
|
|
|
|
duplicate_map->nested_offset += offset;
|
|
|
|
|
duplicate_map->len = MIN(length, map->len - offset);
|
|
|
|
|
|
2021-10-03 14:13:55 -07:00
|
|
|
|
/* The real_len is the nested_offset + the len of the nested fmap.
|
|
|
|
|
real_len is mostly just a shorthand for when doing bounds checking.
|
|
|
|
|
We do not need to keep track of the original length of the OG fmap */
|
|
|
|
|
duplicate_map->real_len = duplicate_map->nested_offset + duplicate_map->len;
|
|
|
|
|
|
2021-10-28 16:58:21 -07:00
|
|
|
|
if (!CLI_ISCONTAINED_2(map->nested_offset, map->len,
|
|
|
|
|
duplicate_map->nested_offset, duplicate_map->len)) {
|
2021-01-23 16:41:41 -08:00
|
|
|
|
size_t len1, len2;
|
|
|
|
|
len1 = map->nested_offset + map->len;
|
|
|
|
|
len2 = duplicate_map->nested_offset + duplicate_map->len;
|
|
|
|
|
cli_warnmsg("fmap_duplicate: internal map error: %zu, %zu; %zu, %zu\n",
|
|
|
|
|
map->nested_offset, len1,
|
2021-10-03 14:13:55 -07:00
|
|
|
|
duplicate_map->nested_offset, len2);
|
2021-01-23 16:41:41 -08:00
|
|
|
|
}
|
|
|
|
|
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
/* This also means the hash will be different.
|
2022-08-18 20:00:33 -07:00
|
|
|
|
* Clear the have_<hash> flags.
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
* It will be calculated when next it is needed. */
|
2025-06-03 19:03:20 -04:00
|
|
|
|
memset(duplicate_map->will_need_hash, 0, sizeof(duplicate_map->will_need_hash));
|
|
|
|
|
memset(duplicate_map->have_hash, 0, sizeof(duplicate_map->have_hash));
|
2020-02-28 18:29:35 -05:00
|
|
|
|
}
|
2009-10-10 15:39:31 +02:00
|
|
|
|
|
2020-03-19 21:23:54 -04:00
|
|
|
|
if (NULL != name) {
|
2024-01-09 17:44:33 -05:00
|
|
|
|
duplicate_map->name = cli_safer_strdup(name);
|
2020-04-08 16:04:20 -04:00
|
|
|
|
if (NULL == duplicate_map->name) {
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfx.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set a similar flag for the layer. The context flag is
then reset so that anything scanned after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded event will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
status = CL_EMEM;
|
|
|
|
|
goto done;
|
2020-03-19 21:23:54 -04:00
|
|
|
|
}
|
|
|
|
|
} else {
|
2020-04-08 16:04:20 -04:00
|
|
|
|
duplicate_map->name = NULL;
|
2020-03-19 21:23:54 -04:00
|
|
|
|
}
|
|
|
|
|
|
2025-06-08 01:12:33 -04:00
|
|
|
|
/* Duplicate the path if it exists */
|
|
|
|
|
if (NULL != map->path) {
|
|
|
|
|
duplicate_map->path = cli_safer_strdup(map->path);
|
|
|
|
|
if (NULL == duplicate_map->path) {
|
|
|
|
|
status = CL_EMEM;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
duplicate_map->path = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2020-02-28 18:29:35 -05:00
|
|
|
|
status = CL_SUCCESS;
|
2009-10-10 16:37:55 +02:00
|
|
|
|
|
2020-02-28 18:29:35 -05:00
|
|
|
|
done:
|
|
|
|
|
if (CL_SUCCESS != status) {
|
|
|
|
|
if (NULL != duplicate_map) {
|
2025-09-26 18:26:00 -04:00
|
|
|
|
if (NULL != duplicate_map->name) {
|
|
|
|
|
free(duplicate_map->name);
|
|
|
|
|
duplicate_map->name = NULL;
|
|
|
|
|
}
|
|
|
|
|
if (NULL != duplicate_map->path) {
|
|
|
|
|
free(duplicate_map->path);
|
|
|
|
|
duplicate_map->path = NULL;
|
|
|
|
|
}
|
2020-02-28 18:29:35 -05:00
|
|
|
|
free(duplicate_map);
|
|
|
|
|
duplicate_map = NULL;
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-08-27 12:08:54 +02:00
|
|
|
|
|
2020-02-28 18:29:35 -05:00
|
|
|
|
return duplicate_map;
|
|
|
|
|
}
|
|
|
|
|
|
2020-03-19 21:23:54 -04:00
|
|
|
|
/**
 * Release a duplicated fmap along with the name and path strings it owns.
 * Safe to call with NULL.
 */
void free_duplicate_fmap(cl_fmap_t *map)
{
    if (NULL == map) {
        return;
    }

    /* free(NULL) is a no-op, so no guards are needed around the members */
    free(map->name);
    map->name = NULL;

    free(map->path);
    map->path = NULL;

    free(map);
}
|
|
|
|
|
|
2020-02-28 18:29:35 -05:00
|
|
|
|
/**
 * Destructor for fmaps created by fmap_open_handle(): releases the page
 * buffer (anonymous mmap when aging, malloc otherwise), the paging bitmap,
 * the optional name/path strings, and finally the map structure itself.
 * Safe to call with NULL.
 */
static void unmap_handle(fmap_t *m)
{
    if (NULL == m) {
        return;
    }

    if (NULL != m->data) {
        /* aging maps are backed by an anonymous mmap, others by malloc */
        if (m->aging) {
            unmap_mmap(m);
        } else {
            free((void *)m->data);
        }
        m->data = NULL;
    }

    if (NULL != m->bitmap) {
        free(m->bitmap);
        m->bitmap = NULL;
    }

    /* free(NULL) is a no-op for the optional strings */
    free(m->name);
    free(m->path);

    free((void *)m);
}
|
2011-06-14 21:15:23 +03:00
|
|
|
|
|
libclamav: scan-layer callback API functions
Add the following scan callbacks:
```c
cl_engine_set_scan_callback(engine, &pre_hash_callback, CL_SCAN_CALLBACK_PRE_HASH);
cl_engine_set_scan_callback(engine, &pre_scan_callback, CL_SCAN_CALLBACK_PRE_SCAN);
cl_engine_set_scan_callback(engine, &post_scan_callback, CL_SCAN_CALLBACK_POST_SCAN);
cl_engine_set_scan_callback(engine, &alert_callback, CL_SCAN_CALLBACK_ALERT);
cl_engine_set_scan_callback(engine, &file_type_callback, CL_SCAN_CALLBACK_FILE_TYPE);
```
Each callback may alter scan behavior using the following return codes:
* CL_BREAK
Scan aborted by callback (the rest of the scan is skipped).
This does not mark the file as clean or infected, it just skips the rest of the scan.
* CL_SUCCESS / CL_CLEAN
File scan will continue.
This is different than CL_VERIFIED because it does not affect prior or future alerts.
Return CL_VERIFIED instead if you want to remove prior alerts for this layer and skip
the rest of the scan for this layer.
* CL_VIRUS
This means you don't trust the file. A new alert will be added.
For CL_SCAN_CALLBACK_ALERT: Means you agree with the alert (no extra alert needed).
* CL_VERIFIED
Layer explicitly trusted by the callback and previous alerts removed FOR THIS layer.
You might want to do this if you trust the hash or verified a digital signature.
The rest of the scan will be skipped FOR THIS layer.
For contained files, this does NOT mean that the parent or adjacent layers are trusted.
Each callback is given a pointer to the current scan layer from which
they can get previous layers, can get the layer's fmap, and then
various attributes of the layer and of the fmap such as:
- layer recursion level
- layer object id
- layer file type
- layer attributes (was decrypted, normalized, embedded, or re-typed)
- layer last alert
- fmap name
- fmap hash (md5, sha1, or sha2-256)
- fmap data (pointer and size)
- fmap file descriptor, if any (fd, offset, size)
- fmap filepath, if any (filepath, offset, size)
To make this possible, this commits introduced a handful of new APIs to
query scan-layer details and fmap details:
- `cl_error_t cl_fmap_set_name(cl_fmap_t *map, const char *name);`
- `cl_error_t cl_fmap_get_name(cl_fmap_t *map, const char **name_out);`
- `cl_error_t cl_fmap_set_path(cl_fmap_t *map, const char *path);`
- `cl_error_t cl_fmap_get_path(cl_fmap_t *map, const char **path_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_fd(const cl_fmap_t *map, int *fd_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_size(const cl_fmap_t *map, size_t *size_out);`
- `cl_error_t cl_fmap_set_hash(const cl_fmap_t *map, const char *hash_alg, char hash);`
- `cl_error_t cl_fmap_have_hash(const cl_fmap_t *map, const char *hash_alg, bool *have_hash_out);`
- `cl_error_t cl_fmap_will_need_hash_later(const cl_fmap_t *map, const char *hash_alg);`
- `cl_error_t cl_fmap_get_hash(const cl_fmap_t *map, const char *hash_alg, const char **hash_out);`
- `cl_error_t cl_fmap_get_data(const cl_fmap_t *map, size_t offset, size_t len, const uint8_t **data_out, size_t *data_len_out);`
- `cl_error_t cl_scan_layer_get_fmap(cl_scan_layer_t *layer, cl_fmap_t **fmap_out);`
- `cl_error_t cl_scan_layer_get_parent_layer(cl_scan_layer_t *layer, cl_scan_layer_t **parent_layer_out);`
- `cl_error_t cl_scan_layer_get_type(cl_scan_layer_t *layer, const char **type_out);`
- `cl_error_t cl_scan_layer_get_recursion_level(cl_scan_layer_t *layer, uint32_t *recursion_level_out);`
- `cl_error_t cl_scan_layer_get_object_id(cl_scan_layer_t *layer, uint64_t *object_id_out);`
- `cl_error_t cl_scan_layer_get_last_alert(cl_scan_layer_t *layer, const char **alert_name_out);`
- `cl_error_t cl_scan_layer_get_attributes(cl_scan_layer_t *layer, uint32_t *attributes_out);`
This commit deprecates but does not remove the existing scan callbacks:
- `void cl_engine_set_clcb_pre_cache(struct cl_engine *engine, clcb_pre_cache callback);`
- `void cl_engine_set_clcb_file_inspection(struct cl_engine *engine, clcb_file_inspection callback);`
- `void cl_engine_set_clcb_pre_scan(struct cl_engine *engine, clcb_pre_scan callback);`
- `void cl_engine_set_clcb_post_scan(struct cl_engine *engine, clcb_post_scan callback);`
- `void cl_engine_set_clcb_virus_found(struct cl_engine *engine, clcb_virus_found callback);`
- `void cl_engine_set_clcb_hash(struct cl_engine *engine, clcb_hash callback);`
This commit also adds an interactive test program to demonstrate the callbacks.
See: `examples/ex_scan_callbacks.c`
CLAM-255
CLAM-2485
CLAM-2626
2025-06-22 14:37:03 -04:00
|
|
|
|
cl_fmap_t *fmap_open_handle(void *handle, size_t offset, size_t len,
|
|
|
|
|
clcb_pread pread_cb, int use_aging)
|
2011-06-14 21:15:23 +03:00
|
|
|
|
{
|
2020-02-28 18:29:35 -05:00
|
|
|
|
cl_error_t status = CL_EMEM;
|
2021-03-31 12:16:41 -07:00
|
|
|
|
uint64_t pages;
|
2020-02-28 18:29:35 -05:00
|
|
|
|
size_t mapsz, bitmap_size;
|
|
|
|
|
cl_fmap_t *m = NULL;
|
|
|
|
|
int pgsz = cli_getpagesize();
|
2011-06-14 21:15:23 +03:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
if ((off_t)offset < 0 || offset != fmap_align_to(offset, pgsz)) {
|
|
|
|
|
cli_warnmsg("fmap: attempted mapping with unaligned offset\n");
|
2020-02-28 18:29:35 -05:00
|
|
|
|
goto done;
|
2009-08-19 20:42:40 +02:00
|
|
|
|
}
|
2018-12-03 12:40:13 -05:00
|
|
|
|
if (!len) {
|
|
|
|
|
cli_dbgmsg("fmap: attempted void mapping\n");
|
2020-02-28 18:29:35 -05:00
|
|
|
|
goto done;
|
2009-08-19 20:42:40 +02:00
|
|
|
|
}
|
2011-06-14 21:15:23 +03:00
|
|
|
|
if (offset >= len) {
|
2018-12-03 12:40:13 -05:00
|
|
|
|
cli_warnmsg("fmap: attempted oof mapping\n");
|
2020-02-28 18:29:35 -05:00
|
|
|
|
goto done;
|
2009-08-19 20:42:40 +02:00
|
|
|
|
}
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2009-08-19 20:42:40 +02:00
|
|
|
|
pages = fmap_align_items(len, pgsz);
|
2020-02-28 18:29:35 -05:00
|
|
|
|
|
2021-03-31 12:16:41 -07:00
|
|
|
|
bitmap_size = pages * sizeof(uint64_t);
|
2020-02-28 18:29:35 -05:00
|
|
|
|
mapsz = pages * pgsz;
|
|
|
|
|
|
2022-05-08 14:59:09 -07:00
|
|
|
|
m = calloc(1, sizeof(fmap_t));
|
2020-02-28 18:29:35 -05:00
|
|
|
|
if (!m) {
|
|
|
|
|
cli_warnmsg("fmap: map header allocation failed\n");
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-09 14:28:34 -07:00
|
|
|
|
m->bitmap = cli_max_calloc(1, bitmap_size);
|
2020-04-08 16:04:20 -04:00
|
|
|
|
if (!m->bitmap) {
|
2020-02-28 18:29:35 -05:00
|
|
|
|
cli_warnmsg("fmap: map header allocation failed\n");
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
2011-06-14 21:15:23 +03:00
|
|
|
|
|
|
|
|
|
#ifndef ANONYMOUS_MAP
|
|
|
|
|
use_aging = 0;
|
|
|
|
|
#endif
|
2010-05-15 22:32:44 +02:00
|
|
|
|
#ifdef ANONYMOUS_MAP
|
2011-06-14 21:15:23 +03:00
|
|
|
|
if (use_aging) {
|
2018-12-03 12:40:13 -05:00
|
|
|
|
fmap_lock;
|
2020-02-28 18:29:35 -05:00
|
|
|
|
if ((m->data = (fmap_t *)mmap(NULL,
|
|
|
|
|
mapsz,
|
|
|
|
|
PROT_READ | PROT_WRITE, MAP_PRIVATE | /* FIXME: MAP_POPULATE is ~8% faster but more memory intensive */ ANONYMOUS_MAP,
|
|
|
|
|
-1,
|
|
|
|
|
0)) == MAP_FAILED) {
|
|
|
|
|
m->data = NULL;
|
2018-12-03 12:40:13 -05:00
|
|
|
|
} else {
|
2009-10-10 16:37:55 +02:00
|
|
|
|
#if HAVE_MADVISE
|
2020-02-28 18:29:35 -05:00
|
|
|
|
madvise((void *)m->data, mapsz, MADV_RANDOM | MADV_DONTFORK);
|
2009-10-10 16:37:55 +02:00
|
|
|
|
#endif /* madvise */
|
2018-12-03 12:40:13 -05:00
|
|
|
|
}
|
|
|
|
|
fmap_unlock;
|
2009-09-11 22:58:31 +02:00
|
|
|
|
}
|
2010-05-15 22:32:44 +02:00
|
|
|
|
#endif /* ANONYMOUS_MAP */
|
2011-06-14 21:15:23 +03:00
|
|
|
|
if (!use_aging) {
|
2022-05-09 14:28:34 -07:00
|
|
|
|
m->data = (fmap_t *)cli_max_malloc(mapsz);
|
2018-12-03 12:40:13 -05:00
|
|
|
|
}
|
2020-02-28 18:29:35 -05:00
|
|
|
|
if (!m->data) {
|
2014-04-15 10:27:34 -04:00
|
|
|
|
cli_warnmsg("fmap: map allocation failed\n");
|
2020-02-28 18:29:35 -05:00
|
|
|
|
goto done;
|
2014-04-15 10:27:34 -04:00
|
|
|
|
}
|
2018-12-03 12:40:13 -05:00
|
|
|
|
m->handle = handle;
|
|
|
|
|
m->pread_cb = pread_cb;
|
2023-12-06 17:33:21 -05:00
|
|
|
|
m->aging = use_aging != 0 ? true : false;
|
2018-12-03 12:40:13 -05:00
|
|
|
|
m->offset = offset;
|
|
|
|
|
m->nested_offset = 0;
|
|
|
|
|
m->len = len; /* m->nested_offset + m->len = m->real_len */
|
|
|
|
|
m->real_len = len;
|
|
|
|
|
m->pages = pages;
|
|
|
|
|
m->pgsz = pgsz;
|
|
|
|
|
m->paged = 0;
|
2022-08-03 20:34:48 -07:00
|
|
|
|
m->dont_cache_flag = false;
|
2020-02-28 18:29:35 -05:00
|
|
|
|
m->unmap = unmap_handle;
|
2018-12-03 12:40:13 -05:00
|
|
|
|
m->need = handle_need;
|
|
|
|
|
m->need_offstr = handle_need_offstr;
|
|
|
|
|
m->gets = handle_gets;
|
|
|
|
|
m->unneed_off = handle_unneed_off;
|
2023-12-06 17:33:21 -05:00
|
|
|
|
m->handle_is_fd = true;
|
2025-06-03 19:03:20 -04:00
|
|
|
|
|
|
|
|
|
memset(m->will_need_hash, 0, sizeof(m->will_need_hash));
|
|
|
|
|
memset(m->have_hash, 0, sizeof(m->have_hash));
|
2020-02-28 18:29:35 -05:00
|
|
|
|
|
|
|
|
|
status = CL_SUCCESS;
|
|
|
|
|
|
|
|
|
|
done:
|
|
|
|
|
if (CL_SUCCESS != status) {
|
|
|
|
|
unmap_handle(m);
|
|
|
|
|
m = NULL;
|
|
|
|
|
}
|
2009-08-19 20:42:40 +02:00
|
|
|
|
return m;
|
|
|
|
|
}
|
2009-08-20 01:07:33 +02:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
/* Evict cold pages from an aging fmap when the amount of paged-in memory
 * exceeds UNPAGE_THRSHLD_HI. Collects the lowest-age unlocked pages via an
 * insertion sort into a fixed-size array, then re-mmaps them over with fresh
 * anonymous pages (coalescing adjacent pages into single mmap calls) so the
 * kernel can reclaim the memory. No-op without ANONYMOUS_MAP support. */
static void fmap_aging(fmap_t *m)
{
#ifdef ANONYMOUS_MAP
    if (!m->aging) return;
    if (m->paged * m->pgsz > UNPAGE_THRSHLD_HI) { /* we alloc'd too much */
        /* freeme[] holds candidate page indices sorted by ascending age;
         * maxavail caps how many pages we will evict in this pass */
        uint64_t i, avail = 0, freeme[2048], maxavail = MIN(sizeof(freeme) / sizeof(*freeme), m->paged - UNPAGE_THRSHLD_LO / m->pgsz) - 1;

        for (i = 0; i < m->pages; i++) {
            uint64_t s = fmap_bitmap[i];
            if ((s & (FM_MASK_PAGED | FM_MASK_LOCKED)) == FM_MASK_PAGED) {
                /* page is paged and not locked: dec age */
                if (s & FM_MASK_COUNT) fmap_bitmap[i]--;
                /* and make it available for unpaging */

                if (!avail) {
                    freeme[0] = i;
                    avail++;
                } else {
                    /* Insert sort onto a stack'd array - same performance as quickselect */
                    uint64_t insert_to = MIN(maxavail, avail) - 1, age = fmap_bitmap[i] & FM_MASK_COUNT;
                    /* only insert if the array isn't full yet, or this page is
                     * colder (lower age) than the current warmest candidate */
                    if (avail <= maxavail || (fmap_bitmap[freeme[maxavail]] & FM_MASK_COUNT) > age) {
                        while ((fmap_bitmap[freeme[insert_to]] & FM_MASK_COUNT) > age) {
                            freeme[insert_to + 1] = freeme[insert_to];
                            if (!insert_to--) break;
                        }
                        freeme[insert_to + 1] = i;
                        if (avail <= maxavail) avail++;
                    }
                }
            }
        }
        if (avail) { /* at least one page is paged and not locked */
            char *lastpage  = NULL;
            char *firstpage = NULL;
            for (i = 0; i < avail; i++) {
                char *pptr = (char *)m->data + freeme[i] * m->pgsz;
                /* we mark the page as seen */
                fmap_bitmap[freeme[i]] = FM_MASK_SEEN;
                /* and we mmap the page over so the kernel knows there's nothing good in there */
                /* reduce number of mmap calls: if pages are adjacent only do 1 mmap call */
                if (lastpage && pptr == lastpage) {
                    lastpage = pptr + m->pgsz;
                    continue;
                }
                if (!lastpage) {
                    firstpage = pptr;
                    lastpage  = pptr + m->pgsz;
                    continue;
                }
                /* non-adjacent page: flush the accumulated run first */
                fmap_lock;
                if (mmap(firstpage, lastpage - firstpage, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | ANONYMOUS_MAP, -1, 0) == MAP_FAILED)
                    cli_dbgmsg("fmap_aging: kernel hates you\n");
                fmap_unlock;
                firstpage = pptr;
                lastpage  = pptr + m->pgsz;
            }
            /* flush the final pending run, if any */
            if (lastpage) {
                fmap_lock;
                if (mmap(firstpage, lastpage - firstpage, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | ANONYMOUS_MAP, -1, 0) == MAP_FAILED)
                    cli_dbgmsg("fmap_aging: kernel hates you\n");
                fmap_unlock;
            }
            m->paged -= avail;
        }
    }
#else
    UNUSEDPARAM(m);
#endif
}
|
|
|
|
|
|
2021-03-31 12:16:41 -07:00
|
|
|
|
/* Page-in `count` pages starting at `first_page`, locking the first
 * `lock_count` of them. Already-paged pages have their lock refcount or age
 * updated; runs of missing pages are batched into single pread_cb() calls.
 * For fd-backed maps, the file's mtime is verified against the map's before
 * re-reading near previously-seen pages (detects file changing under us).
 * Returns 0 on success, 1 on failure. */
static int fmap_readpage(fmap_t *m, uint64_t first_page, uint64_t count, uint64_t lock_count)
{
    size_t readsz = 0, eintr_off;
    char *pptr    = NULL, errtxt[256];
    uint64_t sbitmap;
    uint64_t i, page = first_page, force_read = 0;

    if ((uint64_t)(m->real_len) > (uint64_t)(m->pages * m->pgsz)) {
        cli_dbgmsg("fmap_readpage: size of file exceeds total prefaultible page size (unpacked file is too large)\n");
        return 1;
    }

    fmap_lock;
    for (i = 0; i < count; i++) { /* prefault */
        /* Not worth checking if the page is already paged, just ping each */
        /* Also not worth reusing the loop below */
        volatile char faultme;
        faultme = ((char *)m->data)[(first_page + i) * m->pgsz];
        (void)faultme; // silence "warning: variable ‘faultme’ set but not used"
    }
    fmap_unlock;
    /* note: iterates one past `count` so any pending batched read is flushed */
    for (i = 0; i <= count; i++, page++) {
        int lock;
        if (lock_count) {
            lock_count--;
            lock = 1;
        } else
            lock = 0;
        if (i == count) {
            /* we count one page too much to flush pending reads */
            if (!pptr) return 0; /* if we have any */
            force_read = 1;
        } else if ((sbitmap = fmap_bitmap[page]) & FM_MASK_PAGED) {
            /* page already paged */
            if (lock) {
                /* we want locking */
                if (sbitmap & FM_MASK_LOCKED) {
                    /* page already locked */
                    sbitmap &= FM_MASK_COUNT;
                    if (sbitmap == FM_MASK_COUNT) { /* lock count already at max: fail! */
                        cli_errmsg("fmap_readpage: lock count exceeded\n");
                        return 1;
                    }
                    /* acceptable lock count: inc lock count */
                    fmap_bitmap[page]++;
                } else /* page not currently locked: set lock count = 1 */
                    fmap_bitmap[page] = 1 | FM_MASK_LOCKED | FM_MASK_PAGED;
            } else {
                /* we don't want locking */
                if (!(sbitmap & FM_MASK_LOCKED)) {
                    /* page is not locked: we reset aging to max */
                    fmap_bitmap[page] = FM_MASK_PAGED | FM_MASK_COUNT;
                }
            }
            /* an already-paged page ends any pending batched read */
            if (!pptr) continue;
            force_read = 1;
        }

        if (force_read) {
            /* we have some pending reads to perform */
            if (m->handle_is_fd) {
                uint64_t j;
                int _fd = (int)(ptrdiff_t)m->handle;
                for (j = first_page; j < page; j++) {
                    if (fmap_bitmap[j] & FM_MASK_SEEN) {
                        /* page we've seen before: check mtime */
                        STATBUF st;
                        if (FSTAT(_fd, &st)) {
                            cli_strerror(errno, errtxt, sizeof(errtxt));
                            cli_warnmsg("fmap_readpage: fstat failed: %s\n", errtxt);
                            return 1;
                        }
                        if (m->mtime != (uint64_t)st.st_mtime) {
                            cli_warnmsg("fmap_readpage: file changed as we read it\n");
                            return 1;
                        }
                        break;
                    }
                }
            }

            /* eintr_off tracks bytes already read so EINTR/short reads resume
             * at the right offset */
            eintr_off = 0;
            while (readsz) {
                ssize_t got;
                uint64_t target_offset = eintr_off + m->offset + (first_page * m->pgsz);
                got                    = m->pread_cb(m->handle, pptr, readsz, target_offset);

                if (got < 0 && errno == EINTR)
                    continue;

                if (got > 0) {
                    pptr += got;
                    eintr_off += got;
                    readsz -= got;
                    continue;
                }

                if (got < 0) {
                    cli_strerror(errno, errtxt, sizeof(errtxt));
                    cli_errmsg("fmap_readpage: pread error: %s\n", errtxt);
                } else {
                    cli_warnmsg("fmap_readpage: pread fail: asked for %zu bytes @ offset " STDu64 ", got %zd\n", readsz, target_offset, got);
                }
                return 1;
            }

            /* batch complete: reset the pending-read state */
            pptr       = NULL;
            force_read = 0;
            readsz     = 0;
            continue;
        }

        /* page is not already paged */
        if (!pptr) {
            /* set a new start for pending reads if we don't have one */
            pptr       = (char *)m->data + page * m->pgsz;
            first_page = page;
        }
        /* last page may be partial: only read up to real_len */
        if ((page == m->pages - 1) && (m->real_len % m->pgsz))
            readsz += m->real_len % m->pgsz;
        else
            readsz += m->pgsz;
        if (lock) /* lock requested: set paged, lock page and set lock count to 1 */
            fmap_bitmap[page] = FM_MASK_PAGED | FM_MASK_LOCKED | 1;
        else /* no locking: set paged and set aging to max */
            fmap_bitmap[page] = FM_MASK_PAGED | FM_MASK_COUNT;
        m->paged++;
    }
    return 0;
}
|
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
/**
 * Make `len` bytes at offset `at` available in memory and return a pointer
 * to them, optionally locking the underlying pages against aging eviction.
 * Returns NULL on a zero-length or out-of-bounds request, or if paging-in
 * the required pages fails.
 */
static const void *handle_need(fmap_t *m, size_t at, size_t len, int lock)
{
    uint64_t page_first, page_last, pages_to_lock;

    if (0 == len) {
        return NULL;
    }

    /* translate the caller's offset into an absolute offset in the map */
    at += m->nested_offset;
    if (!CLI_ISCONTAINED(m->nested_offset, m->len, at, len)) {
        return NULL;
    }

    /* give the ager a chance to evict cold pages first */
    fmap_aging(m);

    page_first    = fmap_which_page(m, at);
    page_last     = fmap_which_page(m, at + len - 1);
    pages_to_lock = lock ? (page_last - page_first + 1) : 0;
#ifdef READAHED_PAGES
    /* opportunistically read ahead a few extra pages, clamped to the map */
    page_last += READAHED_PAGES;
    if (page_last >= m->pages) {
        page_last = m->pages - 1;
    }
#endif

    if (fmap_readpage(m, page_first, page_last - page_first + 1, pages_to_lock)) {
        return NULL;
    }

    return (const void *)((char *)m->data + at);
}
|
|
|
|
|
|
2021-03-31 12:16:41 -07:00
|
|
|
|
/**
 * Drop one lock reference from a page previously locked by fmap_readpage().
 * When the last reference is released, the page re-enters aging with its
 * age reset to maximum. Warns if the page wasn't locked to begin with.
 */
static void fmap_unneed_page(fmap_t *m, uint64_t page)
{
    const uint64_t paged_and_locked = FM_MASK_PAGED | FM_MASK_LOCKED;
    uint64_t state                  = fmap_bitmap[page];

    if ((state & paged_and_locked) != paged_and_locked) {
        cli_warnmsg("fmap_unneed: unneed on a unlocked page\n");
        return;
    }

    /* page is paged and locked: examine the lock refcount */
    state &= FM_MASK_COUNT;
    if (state > 1) {
        /* locked more than once: just decrement the refcount */
        fmap_bitmap[page]--;
    } else if (state == 1) {
        /* last lock released: unlock and restart aging at max age */
        fmap_bitmap[page] = FM_MASK_COUNT | FM_MASK_PAGED;
    } else {
        /* locked bit set but refcount zero should be impossible */
        cli_errmsg("fmap_unneed: inconsistent map state\n");
    }
}
|
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
/**
 * Release the page locks covering `len` bytes at offset `at`, previously
 * acquired via handle_need() with locking. No-op for non-aging maps.
 */
static void handle_unneed_off(fmap_t *m, size_t at, size_t len)
{
    uint64_t page, page_first, page_last;

    /* non-aging maps never lock pages, so there is nothing to release */
    if (!m->aging) return;

    if (0 == len) {
        cli_warnmsg("fmap_unneed: attempted void unneed\n");
        return;
    }

    /* translate the caller's offset into an absolute offset in the map */
    at += m->nested_offset;
    if (!CLI_ISCONTAINED(m->nested_offset, m->len, at, len)) {
        cli_warnmsg("fmap: attempted oof unneed\n");
        return;
    }

    page_first = fmap_which_page(m, at);
    page_last  = fmap_which_page(m, at + len - 1);

    for (page = page_first; page <= page_last; page++) {
        fmap_unneed_page(m, page);
    }
}
|
|
|
|
|
|
2011-06-14 20:33:15 +03:00
|
|
|
|
/**
 * Release the anonymous mapping backing an aging fmap's page buffer.
 * No-op when ANONYMOUS_MAP support is compiled out.
 */
static void unmap_mmap(fmap_t *m)
{
#ifdef ANONYMOUS_MAP
    size_t maplen = m->pages * m->pgsz;

    fmap_lock;
    if (-1 == munmap((void *)m->data, maplen)) {
        /* nothing we can do beyond logging the failure */
        cli_warnmsg("fmap_free: unable to unmap memory segment at address: %p with length: %zu\n", (void *)m->data, maplen);
    }
    fmap_unlock;
#else
    UNUSEDPARAM(m);
#endif
}
|
2009-09-05 20:16:10 +02:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
/* Teardown callback for malloc()-backed maps (e.g. created by
 * fmap_open_memory): releases the owned name and path strings, then the
 * map structure itself. Safe to call with m == NULL. Note the mapped data
 * buffer (m->data) is NOT owned by the map and is not freed here. */
static void unmap_malloc(fmap_t *m)
{
    if (NULL == m)
        return;

    /* free(NULL) is a well-defined no-op, so no per-member guards needed. */
    free(m->name);
    free(m->path);
    free((void *)m);
}
|
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
/* Ensure a NUL-terminated string starting at caller-relative offset `at`
 * is fully paged in, and return a pointer to its first byte.
 * Pages are read in and locked one at a time until a '\0' terminator is
 * found within len_hint bytes (len_hint == 0 means "to end of map").
 * On success the pages spanning the string remain locked and the string
 * pointer is returned; if no terminator is found (or a page read fails),
 * every page locked along the way is released and NULL is returned. */
static const void *handle_need_offstr(fmap_t *m, size_t at, size_t len_hint)
{
    uint64_t i, first_page, last_page;
    void *ptr;

    /* Translate the caller-relative offset into a map-absolute offset. */
    at += m->nested_offset;
    ptr = (void *)((char *)m->data + at);

    /* Clamp the search window to the remaining bytes of the backing map. */
    if (!len_hint || len_hint > m->real_len - at)
        len_hint = m->real_len - at;

    if (!CLI_ISCONTAINED(m->nested_offset, m->len, at, len_hint))
        return NULL;

    fmap_aging(m);

    first_page = fmap_which_page(m, at);
    last_page  = fmap_which_page(m, at + len_hint - 1);

    for (i = first_page; i <= last_page; i++) {
        char *thispage = (char *)m->data + i * m->pgsz;
        uint64_t scanat, scansz;

        /* Read (and lock) this page; on failure, remember how far we got
         * so the unwind loop below releases only the pages we locked. */
        if (fmap_readpage(m, i, 1, 1)) {
            last_page = i - 1;
            break;
        }
        /* First page: start scanning at the in-page offset of `at`;
         * subsequent pages are scanned from their beginning. */
        if (i == first_page) {
            scanat = at % m->pgsz;
            scansz = MIN(len_hint, m->pgsz - scanat);
        } else {
            scanat = 0;
            scansz = MIN(len_hint, m->pgsz);
        }
        len_hint -= scansz;
        /* Terminator found: keep the pages locked and hand back the string. */
        if (memchr(&thispage[scanat], 0, scansz))
            return ptr;
    }

    /* No NUL within the window (or a read failed): release every page we
     * locked above and report failure. */
    for (i = first_page; i <= last_page; i++)
        fmap_unneed_page(m, i);
    return NULL;
}
|
2009-09-10 03:20:02 +02:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
/* fgets()-style line reader over a paged map: copies at most max_len - 1
 * bytes starting at caller-relative offset *at into dst, stopping after
 * the first '\n' (which is kept, as fgets does). dst is always
 * NUL-terminated on success and *at is advanced past the bytes consumed.
 * Returns dst, or NULL on an empty/out-of-bounds request or page-read
 * failure. Pages are read without locking (aging may reclaim them later). */
static const void *handle_gets(fmap_t *m, char *dst, size_t *at, size_t max_len)
{
    uint64_t i, first_page, last_page;
    char *src    = (char *)m->data + m->nested_offset + *at;
    char *endptr = NULL;
    /* Copy window: bounded by both the caller's buffer and the map's end. */
    size_t len    = MIN(max_len - 1, m->len - *at);
    size_t fullen = len;

    if (!len || !CLI_ISCONTAINED_0_TO(m->len, *at, len))
        return NULL;

    fmap_aging(m);

    first_page = fmap_which_page(m, m->nested_offset + *at);
    last_page  = fmap_which_page(m, m->nested_offset + *at + len - 1);

    for (i = first_page; i <= last_page; i++) {
        char *thispage = (char *)m->data + i * m->pgsz;
        uint64_t scanat, scansz;

        /* Read the page in; lock=0 since the data is copied out below. */
        if (fmap_readpage(m, i, 1, 0))
            return NULL;

        /* First page: start at the in-page offset of the read position;
         * subsequent pages are scanned from their beginning. */
        if (i == first_page) {
            scanat = (m->nested_offset + *at) % m->pgsz;
            scansz = MIN(len, m->pgsz - scanat);
        } else {
            scanat = 0;
            scansz = MIN(len, m->pgsz);
        }
        len -= scansz;

        /* Stop at the first newline; keep it in the output (fgets-like). */
        if ((endptr = memchr(&thispage[scanat], '\n', scansz))) {
            endptr++;
            break;
        }
    }

    if (endptr) {
        /* Newline found: copy up to and including it. */
        memcpy(dst, src, endptr - src);
        dst[endptr - src] = '\0';
        *at += endptr - src;
    } else {
        /* No newline within the window: copy the whole window. */
        memcpy(dst, src, fullen);
        dst[fullen] = '\0';
        *at += fullen;
    }
    return dst;
}
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2011-06-14 21:26:59 +03:00
|
|
|
|
/* vvvvv MEMORY STUFF BELOW vvvvv */
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2011-06-14 21:26:59 +03:00
|
|
|
|
static const void *mem_need(fmap_t *m, size_t at, size_t len, int lock);
|
2011-06-14 20:59:30 +03:00
|
|
|
|
static void mem_unneed_off(fmap_t *m, size_t at, size_t len);
|
|
|
|
|
static const void *mem_need_offstr(fmap_t *m, size_t at, size_t len_hint);
|
|
|
|
|
static const void *mem_gets(fmap_t *m, char *dst, size_t *at, size_t max_len);
|
|
|
|
|
|
2020-03-19 21:23:54 -04:00
|
|
|
|
fmap_t *fmap_open_memory(const void *start, size_t len, const char *name)
|
2011-06-14 21:19:01 +03:00
|
|
|
|
{
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
cl_error_t status = CL_ERROR;
|
2021-01-23 16:41:41 -08:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
int pgsz = cli_getpagesize();
|
2022-05-08 14:59:09 -07:00
|
|
|
|
cl_fmap_t *m = calloc(1, sizeof(*m));
|
2011-06-14 21:19:01 +03:00
|
|
|
|
if (!m) {
|
2018-12-03 12:40:13 -05:00
|
|
|
|
cli_warnmsg("fmap: map allocation failed\n");
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
goto done;
|
2018-12-03 12:40:13 -05:00
|
|
|
|
}
|
|
|
|
|
m->data = start;
|
|
|
|
|
m->len = len;
|
|
|
|
|
m->real_len = len;
|
|
|
|
|
m->pgsz = pgsz;
|
|
|
|
|
m->pages = fmap_align_items(len, pgsz);
|
|
|
|
|
m->unmap = unmap_malloc;
|
|
|
|
|
m->need = mem_need;
|
2011-06-14 21:19:01 +03:00
|
|
|
|
m->need_offstr = mem_need_offstr;
|
2018-12-03 12:40:13 -05:00
|
|
|
|
m->gets = mem_gets;
|
|
|
|
|
m->unneed_off = mem_unneed_off;
|
2020-03-19 21:23:54 -04:00
|
|
|
|
|
|
|
|
|
if (NULL != name) {
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
/* Copy the name, if one is given */
|
2024-01-09 17:44:33 -05:00
|
|
|
|
m->name = cli_safer_strdup(name);
|
2020-03-19 21:23:54 -04:00
|
|
|
|
if (NULL == m->name) {
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset after that so that anything scanned afterwards doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
cli_warnmsg("fmap: failed to duplicate map name\n");
|
|
|
|
|
goto done;
|
2020-03-19 21:23:54 -04:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
status = CL_SUCCESS;
|
|
|
|
|
|
|
|
|
|
done:
|
|
|
|
|
if (CL_SUCCESS != status) {
|
2021-10-26 13:19:40 -07:00
|
|
|
|
if (NULL != m) {
|
|
|
|
|
if (NULL != m->name) {
|
|
|
|
|
free(m->name);
|
|
|
|
|
}
|
|
|
|
|
free(m);
|
|
|
|
|
m = NULL;
|
2021-01-23 16:41:41 -08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-06-14 22:54:44 +03:00
|
|
|
|
return m;
|
2011-06-14 21:19:01 +03:00
|
|
|
|
}
|
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
static const void *mem_need(fmap_t *m, size_t at, size_t len, int lock)
|
2021-10-03 14:13:55 -07:00
|
|
|
|
{
|
2014-07-10 18:11:49 -04:00
|
|
|
|
UNUSEDPARAM(lock);
|
2018-12-03 12:40:13 -05:00
|
|
|
|
if (!len) {
|
|
|
|
|
return NULL;
|
2011-06-14 22:54:44 +03:00
|
|
|
|
}
|
2011-06-17 23:08:31 +03:00
|
|
|
|
at += m->nested_offset;
|
2021-10-28 16:58:21 -07:00
|
|
|
|
if (!CLI_ISCONTAINED(m->nested_offset, m->len, at, len)) {
|
2018-12-03 12:40:13 -05:00
|
|
|
|
return NULL;
|
2011-06-14 22:54:44 +03:00
|
|
|
|
}
|
|
|
|
|
|
2009-10-10 19:10:15 +02:00
|
|
|
|
return (void *)((char *)m->data + at);
|
|
|
|
|
}
|
|
|
|
|
|
2014-07-10 18:11:49 -04:00
|
|
|
|
static void mem_unneed_off(fmap_t *m, size_t at, size_t len)
|
|
|
|
|
{
|
|
|
|
|
UNUSEDPARAM(m);
|
|
|
|
|
UNUSEDPARAM(at);
|
|
|
|
|
UNUSEDPARAM(len);
|
|
|
|
|
}
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
static const void *mem_need_offstr(fmap_t *m, size_t at, size_t len_hint)
|
|
|
|
|
{
|
2021-10-03 14:13:55 -07:00
|
|
|
|
char *ptr;
|
|
|
|
|
|
|
|
|
|
at += m->nested_offset;
|
|
|
|
|
ptr = (char *)m->data + at;
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
if (!len_hint || len_hint > m->real_len - at)
|
|
|
|
|
len_hint = m->real_len - at;
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2021-10-28 16:58:21 -07:00
|
|
|
|
if (!CLI_ISCONTAINED(m->nested_offset, m->len, at, len_hint))
|
2018-12-03 12:40:13 -05:00
|
|
|
|
return NULL;
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
if (memchr(ptr, 0, len_hint))
|
|
|
|
|
return (void *)ptr;
|
2009-10-10 19:10:15 +02:00
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
static const void *mem_gets(fmap_t *m, char *dst, size_t *at, size_t max_len)
|
|
|
|
|
{
|
2021-10-03 14:13:55 -07:00
|
|
|
|
char *src = (char *)m->data + m->nested_offset + *at;
|
|
|
|
|
char *endptr = NULL;
|
|
|
|
|
size_t len = MIN(max_len - 1, m->len - *at);
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2021-10-28 16:58:21 -07:00
|
|
|
|
if (!len || !CLI_ISCONTAINED_0_TO(m->len, *at, len))
|
2018-12-03 12:40:13 -05:00
|
|
|
|
return NULL;
|
2009-10-10 19:10:15 +02:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
if ((endptr = memchr(src, '\n', len))) {
|
|
|
|
|
endptr++;
|
|
|
|
|
memcpy(dst, src, endptr - src);
|
|
|
|
|
dst[endptr - src] = '\0';
|
|
|
|
|
*at += endptr - src;
|
2009-10-10 19:10:15 +02:00
|
|
|
|
} else {
|
2018-12-03 12:40:13 -05:00
|
|
|
|
memcpy(dst, src, len);
|
|
|
|
|
dst[len] = '\0';
|
|
|
|
|
*at += len;
|
2009-10-10 19:10:15 +02:00
|
|
|
|
}
|
|
|
|
|
return dst;
|
|
|
|
|
}
|
|
|
|
|
|
2025-06-08 01:12:33 -04:00
|
|
|
|
fmap_t *fmap_new(int fd, off_t offset, size_t len, const char *name, const char *path)
|
2018-12-03 12:40:13 -05:00
|
|
|
|
{
|
2010-05-05 12:51:59 +02:00
|
|
|
|
int unused;
|
2025-06-08 01:12:33 -04:00
|
|
|
|
return fmap_check_empty(fd, offset, len, &unused, name, path);
|
2010-05-05 12:51:59 +02:00
|
|
|
|
}
|
|
|
|
|
|
/* Number of `al`-sized items needed to hold `sz` bytes: ceil(sz / al). */
static inline uint64_t fmap_align_items(uint64_t sz, uint64_t al)
{
    uint64_t items = sz / al;

    /* Round up when there is a partial trailing item */
    if (sz % al != 0) {
        items++;
    }
    return items;
}
/* Round `sz` up to the next multiple of the alignment `al`. */
static inline uint64_t fmap_align_to(uint64_t sz, uint64_t al)
{
    /* al * ceil(sz / al), computed without floating point */
    return al * (sz / al + (sz % al != 0));
}
2021-03-31 12:16:41 -07:00
|
|
|
|
static inline uint64_t fmap_which_page(fmap_t *m, size_t at)
|
2018-12-03 12:40:13 -05:00
|
|
|
|
{
|
2009-10-10 20:46:05 +02:00
|
|
|
|
return at / m->pgsz;
|
|
|
|
|
}
|
2011-06-14 18:48:50 +03:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
cl_error_t fmap_dump_to_file(fmap_t *map, const char *filepath, const char *tmpdir, char **outname, int *outfd, size_t start_offset, size_t end_offset)
|
2013-11-22 19:41:46 -05:00
|
|
|
|
{
|
2018-07-30 20:19:28 -04:00
|
|
|
|
cl_error_t ret = CL_EARG;
|
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
char *filebase = NULL;
|
|
|
|
|
char *prefix = NULL;
|
2018-07-30 20:19:28 -04:00
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
char *tmpname = NULL;
|
|
|
|
|
int tmpfd = -1;
|
2018-07-30 20:19:28 -04:00
|
|
|
|
|
|
|
|
|
size_t pos = 0, len = 0, bytes_remaining = 0, write_size = 0;
|
|
|
|
|
|
|
|
|
|
if ((start_offset > map->real_len) || (end_offset < start_offset)) {
|
|
|
|
|
cli_dbgmsg("fmap_dump_to_file: Invalid offset arguments: start %zu, end %zu\n", start_offset, end_offset);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
pos = start_offset;
|
|
|
|
|
end_offset = MIN(end_offset, map->real_len);
|
2018-07-30 20:19:28 -04:00
|
|
|
|
bytes_remaining = end_offset - start_offset;
|
|
|
|
|
|
|
|
|
|
/* Create a filename prefix that includes the original filename, if available */
|
|
|
|
|
if (filepath != NULL) {
|
2025-08-11 18:02:09 -04:00
|
|
|
|
if (CL_SUCCESS != cli_basename(filepath, strlen(filepath), &filebase, true /* posix_support_backslash_pathsep */)) {
|
2018-07-30 20:19:28 -04:00
|
|
|
|
cli_dbgmsg("fmap_dump_to_file: Unable to determine basename from filepath.\n");
|
|
|
|
|
} else if ((start_offset != 0) && (end_offset != map->real_len)) {
|
2023-11-26 15:01:19 -08:00
|
|
|
|
/* If we're only dumping a portion of the file, include the offsets in the prefix,...
|
2022-02-16 00:13:55 +01:00
|
|
|
|
* e.g. tmp filename will become something like: filebase.500-1200.<randhex> */
|
2021-03-31 12:16:41 -07:00
|
|
|
|
size_t prefix_len = strlen(filebase) + 1 + SIZE_T_CHARLEN + 1 + SIZE_T_CHARLEN + 1;
|
|
|
|
|
prefix = malloc(prefix_len);
|
2018-07-30 20:19:28 -04:00
|
|
|
|
if (NULL == prefix) {
|
|
|
|
|
cli_errmsg("fmap_dump_to_file: Failed to allocate memory for tempfile prefix.\n");
|
2021-10-28 16:58:21 -07:00
|
|
|
|
free(filebase);
|
2018-07-30 20:19:28 -04:00
|
|
|
|
return CL_EMEM;
|
|
|
|
|
}
|
|
|
|
|
snprintf(prefix, prefix_len, "%s.%zu-%zu", filebase, start_offset, end_offset);
|
2018-10-01 19:46:23 -04:00
|
|
|
|
|
|
|
|
|
free(filebase);
|
|
|
|
|
filebase = NULL;
|
2018-07-30 20:19:28 -04:00
|
|
|
|
} else {
|
|
|
|
|
/* Else if we're dumping the whole thing, use the filebase as the prefix */
|
2018-12-03 12:40:13 -05:00
|
|
|
|
prefix = filebase;
|
2018-10-01 19:46:23 -04:00
|
|
|
|
filebase = NULL;
|
2018-07-30 20:19:28 -04:00
|
|
|
|
}
|
|
|
|
|
}
|
2013-11-22 19:41:46 -05:00
|
|
|
|
|
|
|
|
|
cli_dbgmsg("fmap_dump_to_file: dumping fmap not backed by file...\n");
|
2018-07-30 20:19:28 -04:00
|
|
|
|
ret = cli_gentempfd_with_prefix(tmpdir, prefix, &tmpname, &tmpfd);
|
|
|
|
|
if (ret != CL_SUCCESS) {
|
2013-11-22 19:41:46 -05:00
|
|
|
|
cli_dbgmsg("fmap_dump_to_file: failed to generate temporary file.\n");
|
2018-10-01 19:46:23 -04:00
|
|
|
|
if (NULL != prefix) {
|
2018-07-30 20:19:28 -04:00
|
|
|
|
free(prefix);
|
|
|
|
|
prefix = NULL;
|
|
|
|
|
}
|
2013-11-22 19:41:46 -05:00
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2018-10-01 19:46:23 -04:00
|
|
|
|
if (NULL != prefix) {
|
2018-07-30 20:19:28 -04:00
|
|
|
|
free(prefix);
|
|
|
|
|
prefix = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2013-11-22 19:41:46 -05:00
|
|
|
|
do {
|
2018-12-03 12:40:13 -05:00
|
|
|
|
const char *b;
|
|
|
|
|
len = 0;
|
2018-07-30 20:19:28 -04:00
|
|
|
|
write_size = MIN(BUFSIZ, bytes_remaining);
|
|
|
|
|
|
|
|
|
|
b = fmap_need_off_once_len(map, pos, write_size, &len);
|
2013-11-22 19:41:46 -05:00
|
|
|
|
pos += len;
|
2018-07-30 20:19:28 -04:00
|
|
|
|
if (b && (len > 0)) {
|
2019-05-04 15:54:54 -04:00
|
|
|
|
if (cli_writen(tmpfd, b, len) != len) {
|
2013-11-22 19:41:46 -05:00
|
|
|
|
cli_warnmsg("fmap_dump_to_file: write failed to %s!\n", tmpname);
|
|
|
|
|
close(tmpfd);
|
|
|
|
|
unlink(tmpname);
|
|
|
|
|
free(tmpname);
|
|
|
|
|
return CL_EWRITE;
|
|
|
|
|
}
|
|
|
|
|
}
|
2018-07-30 20:19:28 -04:00
|
|
|
|
if (len <= bytes_remaining) {
|
|
|
|
|
bytes_remaining -= len;
|
|
|
|
|
} else {
|
|
|
|
|
bytes_remaining = 0;
|
|
|
|
|
}
|
|
|
|
|
} while ((len > 0) && (bytes_remaining > 0));
|
2013-11-22 19:41:46 -05:00
|
|
|
|
|
2018-07-30 20:19:28 -04:00
|
|
|
|
if (lseek(tmpfd, 0, SEEK_SET) == -1) {
|
2013-11-22 19:41:46 -05:00
|
|
|
|
cli_dbgmsg("fmap_dump_to_file: lseek failed\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*outname = tmpname;
|
2018-12-03 12:40:13 -05:00
|
|
|
|
*outfd = tmpfd;
|
2013-11-22 19:41:46 -05:00
|
|
|
|
return CL_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-03 12:40:13 -05:00
|
|
|
|
/**
 * @brief Return the file descriptor backing an fmap, rewound to offset 0.
 *
 * @param m The fmap to query.
 * @return The underlying fd with its offset reset to the start of the file,
 *         or -1 if the map is NULL or is not backed by a file descriptor.
 */
int fmap_fd(fmap_t *m)
{
    int descriptor = -1;

    if (NULL == m) {
        cli_errmsg("fmap_fd: Attempted to get fd for NULL fmap\n");
    } else if (m->handle_is_fd) {
        descriptor = (int)(ptrdiff_t)m->handle;
        /* Rewind so callers read from the beginning of the file. */
        lseek(descriptor, 0, SEEK_SET);
    }

    return descriptor;
}
|
2011-06-17 23:14:36 +03:00
|
|
|
|
|
2025-06-03 19:03:20 -04:00
|
|
|
|
/**
 * @brief Store an externally computed hash digest on an fmap.
 *
 * Copies the raw digest bytes into the map's slot for the given hash type and
 * marks that hash as present so it need not be recomputed later.
 *
 * @param map   The fmap to update. Must not be NULL.
 * @param hash  Raw digest bytes; must be at least cli_hash_len(type) long.
 * @param type  Which hash algorithm slot to fill.
 * @return CL_SUCCESS on success, CL_EARG if any argument is invalid.
 */
cl_error_t fmap_set_hash(fmap_t *map, uint8_t *hash, cli_hash_type_t type)
{
    cl_error_t status = CL_ERROR;
    size_t digest_len;

    if (NULL == map) {
        cli_errmsg("fmap_set_hash: Attempted to set hash for NULL fmap\n");
        status = CL_EARG;
        goto done;
    }

    if (NULL == hash) {
        cli_errmsg("fmap_set_hash: Attempted to set hash to NULL\n");
        status = CL_EARG;
        goto done;
    }

    if (type >= CLI_HASH_AVAIL_TYPES) {
        cli_errmsg("fmap_set_hash: Unsupported hash type %u\n", type);
        status = CL_EARG;
        goto done;
    }

    digest_len = cli_hash_len(type);

    memcpy(map->hash[type], hash, digest_len);
    map->have_hash[type] = true;

    status = CL_SUCCESS;

    if (cli_debug_flag) {
        /* Render the stored digest as lowercase hex for the debug log. */
        char hex[CLI_HASHLEN_MAX * 2 + 1] = {0};
        size_t i;

        for (i = 0; i < digest_len; i++) {
            /* 3 = two hex digits plus the terminating NUL. */
            snprintf(hex + i * 2, 3, "%02x", map->hash[type][i]);
        }
        hex[digest_len * 2] = 0;

        cli_dbgmsg("fmap_set_hash: set %s hash: %s\n", cli_hash_name(type), hex);
    }

done:
    return status;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Flag a hash type as needed so fmap_get_hash() will compute it.
 *
 * Only sets the "will need" flag; no hashing is performed here. The digest is
 * calculated lazily the next time the map's hashes are generated.
 *
 * @param map   The fmap to flag. Must not be NULL.
 * @param type  The hash algorithm that will be needed.
 * @return CL_SUCCESS on success, CL_EARG if any argument is invalid.
 */
cl_error_t fmap_will_need_hash_later(fmap_t *map, cli_hash_type_t type)
{
    if (NULL == map) {
        cli_errmsg("fmap_will_need_hash_later: Attempted to set hash for NULL fmap\n");
        return CL_EARG;
    }

    if (type >= CLI_HASH_AVAIL_TYPES) {
        cli_errmsg("fmap_will_need_hash_later: Unsupported hash type %u\n", type);
        return CL_EARG;
    }

    map->will_need_hash[type] = true;

    return CL_SUCCESS;
}
|
|
|
|
|
|
|
|
|
|
cl_error_t fmap_get_hash(fmap_t *map, unsigned char **hash, cli_hash_type_t type)
|
2020-02-23 12:38:18 -05:00
|
|
|
|
{
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
cl_error_t status = CL_ERROR;
|
2020-02-23 12:38:18 -05:00
|
|
|
|
size_t todo, at = 0;
|
2025-06-03 19:03:20 -04:00
|
|
|
|
void *hashctx[CLI_HASH_AVAIL_TYPES] = {NULL};
|
|
|
|
|
cli_hash_type_t hash_type;
|
2020-02-23 12:38:18 -05:00
|
|
|
|
|
|
|
|
|
todo = map->len;
|
|
|
|
|
|
2025-06-03 19:03:20 -04:00
|
|
|
|
if (type >= CLI_HASH_AVAIL_TYPES) {
|
|
|
|
|
cli_errmsg("fmap_get_hash: Unsupported hash type %u\n", type);
|
|
|
|
|
status = CL_EARG;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If we already have the hash, just return it */
|
|
|
|
|
if (map->have_hash[type]) {
|
|
|
|
|
goto complete;
|
2022-08-18 20:00:33 -07:00
|
|
|
|
}
|
2020-02-23 12:38:18 -05:00
|
|
|
|
|
2025-06-03 19:03:20 -04:00
|
|
|
|
map->will_need_hash[type] = true;
|
|
|
|
|
|
2022-08-18 20:00:33 -07:00
|
|
|
|
/*
|
2025-06-03 19:03:20 -04:00
|
|
|
|
* Need to calculate the requested hash and maybe others, as well.
|
2022-08-18 20:00:33 -07:00
|
|
|
|
*/
|
2020-02-23 12:38:18 -05:00
|
|
|
|
|
2025-06-03 19:03:20 -04:00
|
|
|
|
/* Initialize hash contexts for all needed hash types */
|
|
|
|
|
for (hash_type = CLI_HASH_MD5; hash_type < CLI_HASH_AVAIL_TYPES; hash_type++) {
|
|
|
|
|
if (map->will_need_hash[hash_type] && !map->have_hash[hash_type]) {
|
|
|
|
|
const char *hash_name = cli_hash_name(hash_type);
|
|
|
|
|
|
|
|
|
|
hashctx[hash_type] = cl_hash_init(hash_name);
|
|
|
|
|
if (NULL == hashctx[hash_type]) {
|
|
|
|
|
cli_errmsg("fmap_get_hash: error initializing %s hash context\n", hash_name);
|
|
|
|
|
status = CL_EARG;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-08-18 20:00:33 -07:00
|
|
|
|
}
|
2020-02-23 12:38:18 -05:00
|
|
|
|
|
2022-08-18 20:00:33 -07:00
|
|
|
|
while (todo) {
|
|
|
|
|
const void *buf;
|
|
|
|
|
size_t readme = todo < 1024 * 1024 * 10 ? todo : 1024 * 1024 * 10;
|
2020-02-23 12:38:18 -05:00
|
|
|
|
|
2022-08-18 20:00:33 -07:00
|
|
|
|
if (!(buf = fmap_need_off_once(map, at, readme))) {
|
|
|
|
|
cli_errmsg("fmap_get_hash: error reading while generating hash!\n");
|
|
|
|
|
status = CL_EREAD;
|
|
|
|
|
goto done;
|
2020-02-23 12:38:18 -05:00
|
|
|
|
}
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
|
2022-08-18 20:00:33 -07:00
|
|
|
|
todo -= readme;
|
|
|
|
|
at += readme;
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfx.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset after that so anything following doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost, which would result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded event will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
|
2025-06-03 19:03:20 -04:00
|
|
|
|
for (hash_type = CLI_HASH_MD5; hash_type < CLI_HASH_AVAIL_TYPES; hash_type++) {
|
|
|
|
|
if (map->will_need_hash[hash_type] && !map->have_hash[hash_type]) {
|
|
|
|
|
if (cl_update_hash(hashctx[hash_type], buf, readme)) {
|
|
|
|
|
const char *hash_name = cli_hash_name(hash_type);
|
|
|
|
|
cli_errmsg("fmap_get_hash: error calculating %s hash!\n", hash_name);
|
|
|
|
|
status = CL_EREAD;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-08-18 20:00:33 -07:00
|
|
|
|
}
|
2020-02-23 12:38:18 -05:00
|
|
|
|
}
|
|
|
|
|
|
2025-06-03 19:03:20 -04:00
|
|
|
|
for (hash_type = CLI_HASH_MD5; hash_type < CLI_HASH_AVAIL_TYPES; hash_type++) {
|
|
|
|
|
if (map->will_need_hash[hash_type] && !map->have_hash[hash_type]) {
|
|
|
|
|
cl_finish_hash(hashctx[hash_type], map->hash[hash_type]);
|
|
|
|
|
map->have_hash[hash_type] = true;
|
|
|
|
|
|
|
|
|
|
/* hashctx is finished, don't need to destroy it later */
|
|
|
|
|
hashctx[hash_type] = NULL;
|
libclamav: Fix issue reporting trusted verdicts
If the outermost layer is trusted (e.g. using an FP signature), the verdict passed
back by the `cl_scan*_ex()` functions should be CL_VERDICT_TRUSTED.
To ensure this, and other correct verdicts, I moved the logic setting the verdict
to occur when adding indicators, or trusting a given layer. Then at the end of a
scan, it will set the output verdict parameter to the top level verdict.
This commit also:
* Fixes a bug in the `ex_scan_callbacks` program where a crash would happen when
a hash was retrieved for an inner layer, but not for the container.
* Added debug logs whenever a hash is calculated or set, printing the hash type
and hash string.
* When a layer is trusted, in addition to removing evidence for that layer, it
will also index the metadata JSON (if that feature is enabled) and will rename
any "Viruses" to "IgnoredAlerts", and rename "ContainedIndicators" to
"IgnoredContainedIndicators".
* Fixed an issue where setting the hash algorithm with extra characters, like
setting to "sha256789" would ignore the extra characters, and report the hash
type as the same. It will now fail if the string length differs from the known
hash algorithm.
2025-08-07 13:22:57 -04:00
|
|
|
|
|
|
|
|
|
if (cli_debug_flag) {
|
|
|
|
|
// Convert the hash to a hex string for logging
|
|
|
|
|
char hash_string[CLI_HASHLEN_MAX * 2 + 1] = {0};
|
|
|
|
|
size_t hash_len = cli_hash_len(hash_type);
|
|
|
|
|
|
|
|
|
|
// Convert hash to string.
|
|
|
|
|
size_t i;
|
|
|
|
|
for (i = 0; i < hash_len; i++) {
|
|
|
|
|
sprintf(hash_string + i * 2, "%02x", map->hash[hash_type][i]);
|
|
|
|
|
}
|
|
|
|
|
hash_string[hash_len * 2] = 0;
|
|
|
|
|
|
|
|
|
|
cli_dbgmsg("fmap_get_hash: calculated %s hash: %s\n", cli_hash_name(hash_type), hash_string);
|
|
|
|
|
}
|
2025-06-03 19:03:20 -04:00
|
|
|
|
}
|
2022-08-18 20:00:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
complete:
|
|
|
|
|
|
2025-06-03 19:03:20 -04:00
|
|
|
|
*hash = map->hash[type];
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfx.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset after that so anything following doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost, which would result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded event will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
status = CL_SUCCESS;
|
|
|
|
|
|
|
|
|
|
done:
|
|
|
|
|
|
2025-06-03 19:03:20 -04:00
|
|
|
|
for (hash_type = CLI_HASH_MD5; hash_type < CLI_HASH_AVAIL_TYPES; hash_type++) {
|
|
|
|
|
if (NULL != hashctx[hash_type]) {
|
|
|
|
|
cl_hash_destroy(hashctx[hash_type]);
|
|
|
|
|
}
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfx.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset after that so anything following doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost, which would result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded event will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
}
|
2020-02-23 12:38:18 -05:00
|
|
|
|
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │  └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
|   └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │  └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │  └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │   └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │  └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │  └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │  └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
|   └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfx.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if inside of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │  └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │  └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
|   └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the end of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset after that so anything scanned later doesn't inherit the flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
2021-09-11 14:15:21 -07:00
|
|
|
|
return status;
|
2020-02-23 12:38:18 -05:00
|
|
|
|
}
|
libclamav: scan-layer callback API functions
Add the following scan callbacks:
```c
cl_engine_set_scan_callback(engine, &pre_hash_callback, CL_SCAN_CALLBACK_PRE_HASH);
cl_engine_set_scan_callback(engine, &pre_scan_callback, CL_SCAN_CALLBACK_PRE_SCAN);
cl_engine_set_scan_callback(engine, &post_scan_callback, CL_SCAN_CALLBACK_POST_SCAN);
cl_engine_set_scan_callback(engine, &alert_callback, CL_SCAN_CALLBACK_ALERT);
cl_engine_set_scan_callback(engine, &file_type_callback, CL_SCAN_CALLBACK_FILE_TYPE);
```
Each callback may alter scan behavior using the following return codes:
* CL_BREAK
Scan aborted by callback (the rest of the scan is skipped).
This does not mark the file as clean or infected, it just skips the rest of the scan.
* CL_SUCCESS / CL_CLEAN
File scan will continue.
This is different than CL_VERIFIED because it does not affect prior or future alerts.
Return CL_VERIFIED instead if you want to remove prior alerts for this layer and skip
the rest of the scan for this layer.
* CL_VIRUS
This means you don't trust the file. A new alert will be added.
For CL_SCAN_CALLBACK_ALERT: Means you agree with the alert (no extra alert needed).
* CL_VERIFIED
Layer explicitly trusted by the callback and previous alerts removed FOR THIS layer.
You might want to do this if you trust the hash or verified a digital signature.
The rest of the scan will be skipped FOR THIS layer.
For contained files, this does NOT mean that the parent or adjacent layers are trusted.
Each callback is given a pointer to the current scan layer from which
they can get previous layers, can get the the layer's fmap, and then
various attributes of the layer and of the fmap such as:
- layer recursion level
- layer object id
- layer file type
- layer attributes (was decrypted, normalized, embedded, or re-typed)
- layer last alert
- fmap name
- fmap hash (md5, sha1, or sha2-256)
- fmap data (pointer and size)
- fmap file descriptor, if any (fd, offset, size)
- fmap filepath, if any (filepath, offset, size)
To make this possible, this commits introduced a handful of new APIs to
query scan-layer details and fmap details:
- `cl_error_t cl_fmap_set_name(cl_fmap_t *map, const char *name);`
- `cl_error_t cl_fmap_get_name(cl_fmap_t *map, const char **name_out);`
- `cl_error_t cl_fmap_set_path(cl_fmap_t *map, const char *path);`
- `cl_error_t cl_fmap_get_path(cl_fmap_t *map, const char **path_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_fd(const cl_fmap_t *map, int *fd_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_size(const cl_fmap_t *map, size_t *size_out);`
- `cl_error_t cl_fmap_set_hash(const cl_fmap_t *map, const char *hash_alg, char hash);`
- `cl_error_t cl_fmap_have_hash(const cl_fmap_t *map, const char *hash_alg, bool *have_hash_out);`
- `cl_error_t cl_fmap_will_need_hash_later(const cl_fmap_t *map, const char *hash_alg);`
- `cl_error_t cl_fmap_get_hash(const cl_fmap_t *map, const char *hash_alg, const char **hash_out);`
- `cl_error_t cl_fmap_get_data(const cl_fmap_t *map, size_t offset, size_t len, const uint8_t **data_out, size_t *data_len_out);`
- `cl_error_t cl_scan_layer_get_fmap(cl_scan_layer_t *layer, cl_fmap_t **fmap_out);`
- `cl_error_t cl_scan_layer_get_parent_layer(cl_scan_layer_t *layer, cl_scan_layer_t **parent_layer_out);`
- `cl_error_t cl_scan_layer_get_type(cl_scan_layer_t *layer, const char **type_out);`
- `cl_error_t cl_scan_layer_get_recursion_level(cl_scan_layer_t *layer, uint32_t *recursion_level_out);`
- `cl_error_t cl_scan_layer_get_object_id(cl_scan_layer_t *layer, uint64_t *object_id_out);`
- `cl_error_t cl_scan_layer_get_last_alert(cl_scan_layer_t *layer, const char **alert_name_out);`
- `cl_error_t cl_scan_layer_get_attributes(cl_scan_layer_t *layer, uint32_t *attributes_out);`
This commit deprecates but does not remove the existing scan callbacks:
- `void cl_engine_set_clcb_pre_cache(struct cl_engine *engine, clcb_pre_cache callback);`
- `void cl_engine_set_clcb_file_inspection(struct cl_engine *engine, clcb_file_inspection callback);`
- `void cl_engine_set_clcb_pre_scan(struct cl_engine *engine, clcb_pre_scan callback);`
- `void cl_engine_set_clcb_post_scan(struct cl_engine *engine, clcb_post_scan callback);`
- `void cl_engine_set_clcb_virus_found(struct cl_engine *engine, clcb_virus_found callback);`
- `void cl_engine_set_clcb_hash(struct cl_engine *engine, clcb_hash callback);`
This commit also adds an interactive test program to demonstrate the callbacks.
See: `examples/ex_scan_callbacks.c`
CLAM-255
CLAM-2485
CLAM-2626
2025-06-22 14:37:03 -04:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Public API functions.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Create an fmap backed by a caller-supplied handle and pread callback.
 *
 * Thin public wrapper around the internal fmap_open_handle().
 *
 * @param handle    Opaque caller handle passed back to pread_cb.
 * @param offset    Offset into the underlying data where the map begins.
 * @param len       Length of the mapped region, in bytes.
 * @param pread_cb  Callback used to read data from the handle.
 * @param use_aging Non-zero to enable page aging for the map.
 * @return Pointer to the new map, or NULL on failure (per fmap_open_handle).
 */
extern cl_fmap_t *cl_fmap_open_handle(
    void *handle,
    size_t offset,
    size_t len,
    clcb_pread pread_cb,
    int use_aging)
{
    return fmap_open_handle(handle, offset, len, pread_cb, use_aging);
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Create an fmap over an existing in-memory buffer.
 *
 * Thin public wrapper around the internal fmap_open_memory(); the buffer is
 * not copied, so it must outlive the returned map. The third argument (name)
 * is intentionally NULL here.
 *
 * @param start Pointer to the start of the buffer to map.
 * @param len   Length of the buffer, in bytes.
 * @return Pointer to the new map, or NULL on failure (per fmap_open_memory).
 */
extern cl_fmap_t *cl_fmap_open_memory(const void *start, size_t len)
{
    return (cl_fmap_t *)fmap_open_memory(start, len, NULL);
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Set (replace) the name associated with an fmap.
 *
 * The name is copied; the caller retains ownership of the `name` argument.
 * On allocation failure the map's previous name is left intact.
 *
 * @param map  The fmap to name. Must not be NULL.
 * @param name NUL-terminated name to copy into the map. Must not be NULL.
 * @return CL_SUCCESS on success, CL_ENULLARG on NULL argument, CL_EMEM on
 *         allocation failure.
 */
extern cl_error_t cl_fmap_set_name(cl_fmap_t *map, const char *name)
{
    cl_error_t status = CL_ERROR;
    char *name_copy   = NULL;

    if (!map || !name) {
        status = CL_ENULLARG;
        goto done;
    }

    /* Duplicate first so the existing name is preserved if allocation fails
     * (previously the old name was freed before the strdup, so a failed
     * allocation left the map nameless). */
    name_copy = cli_safer_strdup(name);
    if (!name_copy) {
        status = CL_EMEM;
        goto done;
    }

    free(map->name);
    map->name = name_copy;

    status = CL_SUCCESS;

done:
    return status;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Get the name associated with an fmap, if any.
 *
 * The returned pointer references the map's internal storage; the caller
 * must not free it. May be NULL if no name was ever set.
 *
 * @param map      The fmap to query. Must not be NULL.
 * @param name_out Output pointer for the name. Must not be NULL.
 * @return CL_SUCCESS on success, CL_ENULLARG on NULL argument.
 */
extern cl_error_t cl_fmap_get_name(cl_fmap_t *map, const char **name_out)
{
    if (NULL == map || NULL == name_out) {
        return CL_ENULLARG;
    }

    *name_out = map->name;

    return CL_SUCCESS;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Set (replace) the filesystem path associated with an fmap.
 *
 * The path is copied; the caller retains ownership of the `path` argument.
 * On allocation failure the map's previous path is left intact.
 *
 * @param map  The fmap to update. Must not be NULL.
 * @param path NUL-terminated path to copy into the map. Must not be NULL.
 * @return CL_SUCCESS on success, CL_ENULLARG on NULL argument, CL_EMEM on
 *         allocation failure.
 */
extern cl_error_t cl_fmap_set_path(cl_fmap_t *map, const char *path)
{
    cl_error_t status = CL_ERROR;
    char *path_copy   = NULL;

    if (!map || !path) {
        status = CL_ENULLARG;
        goto done;
    }

    /* Duplicate first so the existing path is preserved if allocation fails
     * (previously the old path was freed before the strdup, so a failed
     * allocation left the map pathless). */
    path_copy = cli_safer_strdup(path);
    if (!path_copy) {
        status = CL_EMEM;
        goto done;
    }

    free(map->path);
    map->path = path_copy;

    status = CL_SUCCESS;

done:
    return status;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Get the filesystem path backing an fmap, plus the mapped window.
 *
 * The returned path references the map's internal storage; do not free it.
 *
 * @param map        The fmap to query. Must not be NULL.
 * @param path_out   Output pointer for the path. Must not be NULL. Set to
 *                   NULL (with CL_EACCES returned) if the map has no path.
 * @param offset_out (optional) Output for the map's offset into the file.
 * @param len_out    (optional) Output for the mapped length, in bytes.
 * @return CL_SUCCESS on success, CL_ENULLARG on NULL argument, CL_EACCES if
 *         no path is available for this map.
 */
extern cl_error_t cl_fmap_get_path(cl_fmap_t *map, const char **path_out, size_t *offset_out, size_t *len_out)
{
    if (NULL == map || NULL == path_out) {
        return CL_ENULLARG;
    }

    *path_out = map->path;
    if (NULL == *path_out) {
        /* This map is not backed by a named file. */
        return CL_EACCES;
    }

    if (NULL != offset_out) {
        *offset_out = map->offset;
    }
    if (NULL != len_out) {
        *len_out = map->len;
    }

    return CL_SUCCESS;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Get the file descriptor backing an fmap, plus the mapped window.
 *
 * @param map        The fmap to query. Must not be NULL.
 * @param fd_out     Output pointer for the descriptor. Must not be NULL.
 *                   Set to -1 (with CL_EACCES returned) if the map has no fd.
 * @param offset_out (optional) Output for the map's offset into the file.
 * @param len_out    (optional) Output for the mapped length, in bytes.
 * @return CL_SUCCESS on success, CL_ENULLARG on NULL argument, CL_EACCES if
 *         this map is not descriptor-backed.
 */
extern cl_error_t cl_fmap_get_fd(const cl_fmap_t *map, int *fd_out, size_t *offset_out, size_t *len_out)
{
    int fd;

    if (NULL == map || NULL == fd_out) {
        return CL_ENULLARG;
    }

    fd      = fmap_fd((fmap_t *)map);
    *fd_out = fd;
    if (-1 == fd) {
        /* Memory- or handle-backed map: no descriptor to hand out. */
        return CL_EACCES;
    }

    if (NULL != offset_out) {
        *offset_out = map->offset;
    }
    if (NULL != len_out) {
        *len_out = map->len;
    }

    return CL_SUCCESS;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Get the size of the data mapped by an fmap.
 *
 * @param map      The fmap to query. Must not be NULL.
 * @param size_out Output pointer for the size in bytes. Must not be NULL.
 * @return CL_SUCCESS on success, CL_ENULLARG on NULL argument.
 */
extern cl_error_t cl_fmap_get_size(const cl_fmap_t *map, size_t *size_out)
{
    if (NULL == map || NULL == size_out) {
        return CL_ENULLARG;
    }

    *size_out = map->len;

    return CL_SUCCESS;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Store a precomputed hash on an fmap.
 *
 * NOTE(review): `hash` is a single `char` passed by value, yet `&hash` is
 * handed to fmap_set_hash() as though it pointed at a full digest buffer.
 * This looks like the parameter should be `const char *hash`; confirm
 * against fmap_set_hash()'s expectations before relying on this API.
 *
 * @param map      The fmap to update. Must not be NULL.
 * @param hash_alg Hash algorithm name (e.g. "md5", "sha1", "sha2-256").
 * @param hash     Hash value (see NOTE above regarding the type).
 * @return CL_SUCCESS on success, CL_ENULLARG on NULL argument, CL_EARG for
 *         an unsupported algorithm, or an error from the lookup/set calls.
 */
extern cl_error_t cl_fmap_set_hash(const cl_fmap_t *map, const char *hash_alg, char hash)
{
    cl_error_t status = CL_ERROR;
    cli_hash_type_t type;

    if (!map || !hash_alg) {
        status = CL_ENULLARG;
        goto done;
    }

    /* Translate the algorithm name into the internal hash-type enum. */
    status = cli_hash_type_from_name(hash_alg, &type);
    if (status != CL_SUCCESS) {
        cli_errmsg("cl_fmap_set_hash: Unknown hash algorithm: %s\n", hash_alg);
        goto done;
    }

    /* The name may map to a type this build does not support. */
    if (type >= CLI_HASH_AVAIL_TYPES) {
        cli_errmsg("cl_fmap_set_hash: Unsupported hash algorithm: %s\n", hash_alg);
        status = CL_EARG;
        goto done;
    }

    status = fmap_set_hash((fmap_t *)map, (uint8_t *)&hash, type);
    if (status != CL_SUCCESS) {
        cli_errmsg("cl_fmap_set_hash: Failed to set hash for algorithm: %s\n", hash_alg);
        goto done;
    }

done:
    return status;
}
|
|
|
|
|
|
|
|
|
|
extern cl_error_t cl_fmap_have_hash(const cl_fmap_t *map, const char *hash_alg, bool *have_hash_out)
|
|
|
|
|
{
|
|
|
|
|
cl_error_t status = CL_ERROR;
|
|
|
|
|
cli_hash_type_t type;
|
|
|
|
|
|
|
|
|
|
if (!map || !hash_alg || !have_hash_out) {
|
|
|
|
|
status = CL_ENULLARG;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
status = cli_hash_type_from_name(hash_alg, &type);
|
|
|
|
|
if (status != CL_SUCCESS) {
|
|
|
|
|
cli_errmsg("cl_fmap_have_hash: Unknown hash algorithm: %s\n", hash_alg);
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (type >= CLI_HASH_AVAIL_TYPES) {
|
|
|
|
|
cli_errmsg("cl_fmap_have_hash: Unsupported hash algorithm: %s\n", hash_alg);
|
|
|
|
|
status = CL_EARG;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*have_hash_out = map->have_hash[type];
|
|
|
|
|
status = CL_SUCCESS;
|
|
|
|
|
|
|
|
|
|
done:
|
|
|
|
|
return status;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
extern cl_error_t cl_fmap_will_need_hash_later(const cl_fmap_t *map, const char *hash_alg)
|
|
|
|
|
{
|
|
|
|
|
cl_error_t status = CL_ERROR;
|
|
|
|
|
cli_hash_type_t type;
|
|
|
|
|
|
|
|
|
|
if (!map || !hash_alg) {
|
|
|
|
|
status = CL_ENULLARG;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
status = cli_hash_type_from_name(hash_alg, &type);
|
|
|
|
|
if (status != CL_SUCCESS) {
|
|
|
|
|
cli_errmsg("cl_fmap_will_need_hash_later: Unknown hash algorithm: %s\n", hash_alg);
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (type >= CLI_HASH_AVAIL_TYPES) {
|
|
|
|
|
cli_errmsg("cl_fmap_will_need_hash_later: Unsupported hash algorithm: %s\n", hash_alg);
|
|
|
|
|
status = CL_EARG;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
status = fmap_will_need_hash_later((fmap_t *)map, type);
|
|
|
|
|
if (status != CL_SUCCESS) {
|
|
|
|
|
cli_errmsg("cl_fmap_will_need_hash_later: Failed to indicate need for hash algorithm: %s\n", hash_alg);
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
status = CL_SUCCESS;
|
|
|
|
|
|
|
|
|
|
done:
|
|
|
|
|
return status;
|
|
|
|
|
}
|
|
|
|
|
|
2025-07-23 00:23:44 -04:00
|
|
|
|
extern cl_error_t cl_fmap_get_hash(const cl_fmap_t *map, const char *hash_alg, char **hash_out)
|
libclamav: scan-layer callback API functions
Add the following scan callbacks:
```c
cl_engine_set_scan_callback(engine, &pre_hash_callback, CL_SCAN_CALLBACK_PRE_HASH);
cl_engine_set_scan_callback(engine, &pre_scan_callback, CL_SCAN_CALLBACK_PRE_SCAN);
cl_engine_set_scan_callback(engine, &post_scan_callback, CL_SCAN_CALLBACK_POST_SCAN);
cl_engine_set_scan_callback(engine, &alert_callback, CL_SCAN_CALLBACK_ALERT);
cl_engine_set_scan_callback(engine, &file_type_callback, CL_SCAN_CALLBACK_FILE_TYPE);
```
Each callback may alter scan behavior using the following return codes:
* CL_BREAK
Scan aborted by callback (the rest of the scan is skipped).
This does not mark the file as clean or infected, it just skips the rest of the scan.
* CL_SUCCESS / CL_CLEAN
File scan will continue.
This is different than CL_VERIFIED because it does not affect prior or future alerts.
Return CL_VERIFIED instead if you want to remove prior alerts for this layer and skip
the rest of the scan for this layer.
* CL_VIRUS
This means you don't trust the file. A new alert will be added.
For CL_SCAN_CALLBACK_ALERT: Means you agree with the alert (no extra alert needed).
* CL_VERIFIED
Layer explicitly trusted by the callback and previous alerts removed FOR THIS layer.
You might want to do this if you trust the hash or verified a digital signature.
The rest of the scan will be skipped FOR THIS layer.
For contained files, this does NOT mean that the parent or adjacent layers are trusted.
Each callback is given a pointer to the current scan layer from which
they can get previous layers, can get the the layer's fmap, and then
various attributes of the layer and of the fmap such as:
- layer recursion level
- layer object id
- layer file type
- layer attributes (was decerypted, normalized, embedded, or re-typed)
- layer last alert
- fmap name
- fmap hash (md5, sha1, or sha2-256)
- fmap data (pointer and size)
- fmap file descriptor, if any (fd, offset, size)
- fmap filepath, if any (filepath, offset, size)
To make this possible, this commits introduced a handful of new APIs to
query scan-layer details and fmap details:
- `cl_error_t cl_fmap_set_name(cl_fmap_t *map, const char *name);`
- `cl_error_t cl_fmap_get_name(cl_fmap_t *map, const char **name_out);`
- `cl_error_t cl_fmap_set_path(cl_fmap_t *map, const char *path);`
- `cl_error_t cl_fmap_get_path(cl_fmap_t *map, const char **path_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_fd(const cl_fmap_t *map, int *fd_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_size(const cl_fmap_t *map, size_t *size_out);`
- `cl_error_t cl_fmap_set_hash(const cl_fmap_t *map, const char *hash_alg, char hash);`
- `cl_error_t cl_fmap_have_hash(const cl_fmap_t *map, const char *hash_alg, bool *have_hash_out);`
- `cl_error_t cl_fmap_will_need_hash_later(const cl_fmap_t *map, const char *hash_alg);`
- `cl_error_t cl_fmap_get_hash(const cl_fmap_t *map, const char *hash_alg, const char **hash_out);`
- `cl_error_t cl_fmap_get_data(const cl_fmap_t *map, size_t offset, size_t len, const uint8_t **data_out, size_t *data_len_out);`
- `cl_error_t cl_scan_layer_get_fmap(cl_scan_layer_t *layer, cl_fmap_t **fmap_out);`
- `cl_error_t cl_scan_layer_get_parent_layer(cl_scan_layer_t *layer, cl_scan_layer_t **parent_layer_out);`
- `cl_error_t cl_scan_layer_get_type(cl_scan_layer_t *layer, const char **type_out);`
- `cl_error_t cl_scan_layer_get_recursion_level(cl_scan_layer_t *layer, uint32_t *recursion_level_out);`
- `cl_error_t cl_scan_layer_get_object_id(cl_scan_layer_t *layer, uint64_t *object_id_out);`
- `cl_error_t cl_scan_layer_get_last_alert(cl_scan_layer_t *layer, const char **alert_name_out);`
- `cl_error_t cl_scan_layer_get_attributes(cl_scan_layer_t *layer, uint32_t *attributes_out);`
This commit deprecates but does not remove the existing scan callbacks:
- `void cl_engine_set_clcb_pre_cache(struct cl_engine *engine, clcb_pre_cache callback);`
- `void cl_engine_set_clcb_file_inspection(struct cl_engine *engine, clcb_file_inspection callback);`
- `void cl_engine_set_clcb_pre_scan(struct cl_engine *engine, clcb_pre_scan callback);`
- `void cl_engine_set_clcb_post_scan(struct cl_engine *engine, clcb_post_scan callback);`
- `void cl_engine_set_clcb_virus_found(struct cl_engine *engine, clcb_virus_found callback);`
- `void cl_engine_set_clcb_hash(struct cl_engine *engine, clcb_hash callback);`
This commit also adds an interactive test program to demonstrate the callbacks.
See: `examples/ex_scan_callbacks.c`
CLAM-255
CLAM-2485
CLAM-2626
2025-06-22 14:37:03 -04:00
|
|
|
|
{
|
|
|
|
|
cl_error_t status = CL_ERROR;
|
|
|
|
|
cli_hash_type_t type;
|
|
|
|
|
unsigned char *hash;
|
|
|
|
|
char *hash_string = NULL;
|
|
|
|
|
size_t hash_len;
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
|
if (!map || !hash_alg || !hash_out) {
|
|
|
|
|
status = CL_ENULLARG;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
status = cli_hash_type_from_name(hash_alg, &type);
|
|
|
|
|
if (status != CL_SUCCESS) {
|
|
|
|
|
cli_errmsg("cl_fmap_get_hash: Unknown hash algorithm: %s\n", hash_alg);
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (type >= CLI_HASH_AVAIL_TYPES) {
|
|
|
|
|
cli_errmsg("cl_fmap_get_hash: Unsupported hash algorithm: %s\n", hash_alg);
|
|
|
|
|
status = CL_EARG;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
status = fmap_get_hash((fmap_t *)map, &hash, type);
|
|
|
|
|
if (status != CL_SUCCESS) {
|
|
|
|
|
cli_errmsg("cl_fmap_get_hash: Failed to get hash for algorithm: %s\n", hash_alg);
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hash_len = cli_hash_len(type);
|
|
|
|
|
|
|
|
|
|
/* Convert hash to string */
|
|
|
|
|
hash_string = malloc(hash_len * 2 + 1);
|
|
|
|
|
if (!hash_string) {
|
|
|
|
|
cli_errmsg("cl_fmap_get_hash: Failed to allocate memory for hash string\n");
|
|
|
|
|
status = CL_EMEM;
|
|
|
|
|
goto done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < hash_len; i++) {
|
|
|
|
|
sprintf(hash_string + i * 2, "%02x", hash[i]);
|
|
|
|
|
}
|
|
|
|
|
hash_string[hash_len * 2] = 0;
|
|
|
|
|
|
2025-09-26 18:26:00 -04:00
|
|
|
|
*hash_out = hash_string;
|
|
|
|
|
hash_string = NULL; /* transfer ownership to *hash_out */
|
|
|
|
|
|
|
|
|
|
status = CL_SUCCESS;
|
libclamav: scan-layer callback API functions
Add the following scan callbacks:
```c
cl_engine_set_scan_callback(engine, &pre_hash_callback, CL_SCAN_CALLBACK_PRE_HASH);
cl_engine_set_scan_callback(engine, &pre_scan_callback, CL_SCAN_CALLBACK_PRE_SCAN);
cl_engine_set_scan_callback(engine, &post_scan_callback, CL_SCAN_CALLBACK_POST_SCAN);
cl_engine_set_scan_callback(engine, &alert_callback, CL_SCAN_CALLBACK_ALERT);
cl_engine_set_scan_callback(engine, &file_type_callback, CL_SCAN_CALLBACK_FILE_TYPE);
```
Each callback may alter scan behavior using the following return codes:
* CL_BREAK
Scan aborted by callback (the rest of the scan is skipped).
This does not mark the file as clean or infected, it just skips the rest of the scan.
* CL_SUCCESS / CL_CLEAN
File scan will continue.
This is different than CL_VERIFIED because it does not affect prior or future alerts.
Return CL_VERIFIED instead if you want to remove prior alerts for this layer and skip
the rest of the scan for this layer.
* CL_VIRUS
This means you don't trust the file. A new alert will be added.
For CL_SCAN_CALLBACK_ALERT: Means you agree with the alert (no extra alert needed).
* CL_VERIFIED
Layer explicitly trusted by the callback and previous alerts removed FOR THIS layer.
You might want to do this if you trust the hash or verified a digital signature.
The rest of the scan will be skipped FOR THIS layer.
For contained files, this does NOT mean that the parent or adjacent layers are trusted.
Each callback is given a pointer to the current scan layer from which
they can get previous layers, can get the the layer's fmap, and then
various attributes of the layer and of the fmap such as:
- layer recursion level
- layer object id
- layer file type
- layer attributes (was decerypted, normalized, embedded, or re-typed)
- layer last alert
- fmap name
- fmap hash (md5, sha1, or sha2-256)
- fmap data (pointer and size)
- fmap file descriptor, if any (fd, offset, size)
- fmap filepath, if any (filepath, offset, size)
To make this possible, this commits introduced a handful of new APIs to
query scan-layer details and fmap details:
- `cl_error_t cl_fmap_set_name(cl_fmap_t *map, const char *name);`
- `cl_error_t cl_fmap_get_name(cl_fmap_t *map, const char **name_out);`
- `cl_error_t cl_fmap_set_path(cl_fmap_t *map, const char *path);`
- `cl_error_t cl_fmap_get_path(cl_fmap_t *map, const char **path_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_fd(const cl_fmap_t *map, int *fd_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_size(const cl_fmap_t *map, size_t *size_out);`
- `cl_error_t cl_fmap_set_hash(const cl_fmap_t *map, const char *hash_alg, char hash);`
- `cl_error_t cl_fmap_have_hash(const cl_fmap_t *map, const char *hash_alg, bool *have_hash_out);`
- `cl_error_t cl_fmap_will_need_hash_later(const cl_fmap_t *map, const char *hash_alg);`
- `cl_error_t cl_fmap_get_hash(const cl_fmap_t *map, const char *hash_alg, const char **hash_out);`
- `cl_error_t cl_fmap_get_data(const cl_fmap_t *map, size_t offset, size_t len, const uint8_t **data_out, size_t *data_len_out);`
- `cl_error_t cl_scan_layer_get_fmap(cl_scan_layer_t *layer, cl_fmap_t **fmap_out);`
- `cl_error_t cl_scan_layer_get_parent_layer(cl_scan_layer_t *layer, cl_scan_layer_t **parent_layer_out);`
- `cl_error_t cl_scan_layer_get_type(cl_scan_layer_t *layer, const char **type_out);`
- `cl_error_t cl_scan_layer_get_recursion_level(cl_scan_layer_t *layer, uint32_t *recursion_level_out);`
- `cl_error_t cl_scan_layer_get_object_id(cl_scan_layer_t *layer, uint64_t *object_id_out);`
- `cl_error_t cl_scan_layer_get_last_alert(cl_scan_layer_t *layer, const char **alert_name_out);`
- `cl_error_t cl_scan_layer_get_attributes(cl_scan_layer_t *layer, uint32_t *attributes_out);`
This commit deprecates but does not remove the existing scan callbacks:
- `void cl_engine_set_clcb_pre_cache(struct cl_engine *engine, clcb_pre_cache callback);`
- `void cl_engine_set_clcb_file_inspection(struct cl_engine *engine, clcb_file_inspection callback);`
- `void cl_engine_set_clcb_pre_scan(struct cl_engine *engine, clcb_pre_scan callback);`
- `void cl_engine_set_clcb_post_scan(struct cl_engine *engine, clcb_post_scan callback);`
- `void cl_engine_set_clcb_virus_found(struct cl_engine *engine, clcb_virus_found callback);`
- `void cl_engine_set_clcb_hash(struct cl_engine *engine, clcb_hash callback);`
This commit also adds an interactive test program to demonstrate the callbacks.
See: `examples/ex_scan_callbacks.c`
CLAM-255
CLAM-2485
CLAM-2626
2025-06-22 14:37:03 -04:00
|
|
|
|
|
|
|
|
|
done:
|
2025-09-26 18:26:00 -04:00
|
|
|
|
if (NULL != hash_string) {
|
|
|
|
|
free(hash_string);
|
|
|
|
|
hash_string = NULL;
|
libclamav: scan-layer callback API functions
Add the following scan callbacks:
```c
cl_engine_set_scan_callback(engine, &pre_hash_callback, CL_SCAN_CALLBACK_PRE_HASH);
cl_engine_set_scan_callback(engine, &pre_scan_callback, CL_SCAN_CALLBACK_PRE_SCAN);
cl_engine_set_scan_callback(engine, &post_scan_callback, CL_SCAN_CALLBACK_POST_SCAN);
cl_engine_set_scan_callback(engine, &alert_callback, CL_SCAN_CALLBACK_ALERT);
cl_engine_set_scan_callback(engine, &file_type_callback, CL_SCAN_CALLBACK_FILE_TYPE);
```
Each callback may alter scan behavior using the following return codes:
* CL_BREAK
Scan aborted by callback (the rest of the scan is skipped).
This does not mark the file as clean or infected, it just skips the rest of the scan.
* CL_SUCCESS / CL_CLEAN
File scan will continue.
This is different than CL_VERIFIED because it does not affect prior or future alerts.
Return CL_VERIFIED instead if you want to remove prior alerts for this layer and skip
the rest of the scan for this layer.
* CL_VIRUS
This means you don't trust the file. A new alert will be added.
For CL_SCAN_CALLBACK_ALERT: Means you agree with the alert (no extra alert needed).
* CL_VERIFIED
Layer explicitly trusted by the callback and previous alerts removed FOR THIS layer.
You might want to do this if you trust the hash or verified a digital signature.
The rest of the scan will be skipped FOR THIS layer.
For contained files, this does NOT mean that the parent or adjacent layers are trusted.
Each callback is given a pointer to the current scan layer from which
they can get previous layers, can get the the layer's fmap, and then
various attributes of the layer and of the fmap such as:
- layer recursion level
- layer object id
- layer file type
- layer attributes (was decerypted, normalized, embedded, or re-typed)
- layer last alert
- fmap name
- fmap hash (md5, sha1, or sha2-256)
- fmap data (pointer and size)
- fmap file descriptor, if any (fd, offset, size)
- fmap filepath, if any (filepath, offset, size)
To make this possible, this commits introduced a handful of new APIs to
query scan-layer details and fmap details:
- `cl_error_t cl_fmap_set_name(cl_fmap_t *map, const char *name);`
- `cl_error_t cl_fmap_get_name(cl_fmap_t *map, const char **name_out);`
- `cl_error_t cl_fmap_set_path(cl_fmap_t *map, const char *path);`
- `cl_error_t cl_fmap_get_path(cl_fmap_t *map, const char **path_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_fd(const cl_fmap_t *map, int *fd_out, size_t *offset_out, size_t *len_out);`
- `cl_error_t cl_fmap_get_size(const cl_fmap_t *map, size_t *size_out);`
- `cl_error_t cl_fmap_set_hash(const cl_fmap_t *map, const char *hash_alg, char hash);`
- `cl_error_t cl_fmap_have_hash(const cl_fmap_t *map, const char *hash_alg, bool *have_hash_out);`
- `cl_error_t cl_fmap_will_need_hash_later(const cl_fmap_t *map, const char *hash_alg);`
- `cl_error_t cl_fmap_get_hash(const cl_fmap_t *map, const char *hash_alg, const char **hash_out);`
- `cl_error_t cl_fmap_get_data(const cl_fmap_t *map, size_t offset, size_t len, const uint8_t **data_out, size_t *data_len_out);`
- `cl_error_t cl_scan_layer_get_fmap(cl_scan_layer_t *layer, cl_fmap_t **fmap_out);`
- `cl_error_t cl_scan_layer_get_parent_layer(cl_scan_layer_t *layer, cl_scan_layer_t **parent_layer_out);`
- `cl_error_t cl_scan_layer_get_type(cl_scan_layer_t *layer, const char **type_out);`
- `cl_error_t cl_scan_layer_get_recursion_level(cl_scan_layer_t *layer, uint32_t *recursion_level_out);`
- `cl_error_t cl_scan_layer_get_object_id(cl_scan_layer_t *layer, uint64_t *object_id_out);`
- `cl_error_t cl_scan_layer_get_last_alert(cl_scan_layer_t *layer, const char **alert_name_out);`
- `cl_error_t cl_scan_layer_get_attributes(cl_scan_layer_t *layer, uint32_t *attributes_out);`
This commit deprecates but does not remove the existing scan callbacks:
- `void cl_engine_set_clcb_pre_cache(struct cl_engine *engine, clcb_pre_cache callback);`
- `void cl_engine_set_clcb_file_inspection(struct cl_engine *engine, clcb_file_inspection callback);`
- `void cl_engine_set_clcb_pre_scan(struct cl_engine *engine, clcb_pre_scan callback);`
- `void cl_engine_set_clcb_post_scan(struct cl_engine *engine, clcb_post_scan callback);`
- `void cl_engine_set_clcb_virus_found(struct cl_engine *engine, clcb_virus_found callback);`
- `void cl_engine_set_clcb_hash(struct cl_engine *engine, clcb_hash callback);`
This commit also adds an interactive test program to demonstrate the callbacks.
See: `examples/ex_scan_callbacks.c`
CLAM-255
CLAM-2485
CLAM-2626
2025-06-22 14:37:03 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return status;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Get a pointer into an fmap's data at the requested window.
 *
 * The returned pointer references the map's internal buffer; do not free
 * it, and do not use it after the map is closed.
 *
 * @param map          The fmap to read from. Must not be NULL.
 * @param offset       Start offset; must not exceed the map length.
 * @param len          Bytes requested. 0 means "to end of file"; oversized
 *                     requests are clamped to the end of file.
 * @param data_out     Output pointer for the data. Must not be NULL.
 * @param data_len_out (optional) Output for the actual length provided.
 * @return CL_SUCCESS on success, CL_ENULLARG on NULL argument, CL_EARG if
 *         offset is past EOF, CL_EREAD if the data could not be paged in.
 */
extern cl_error_t cl_fmap_get_data(const cl_fmap_t *map, size_t offset, size_t len, const uint8_t **data_out, size_t *data_len_out)
{
    cl_error_t status = CL_ERROR;
    const uint8_t *data;

    if (!map || !data_out) {
        status = CL_ENULLARG;
        goto done;
    }

    if (offset > map->len) {
        cli_errmsg("cl_fmap_get_data: Offset %zu is beyond end of file (file length %zu)\n", offset, map->len);
        status = CL_EARG;
        goto done;
    }

    /* len == 0 means read to EOF; oversized requests are clamped to EOF.
     * Compare against (map->len - offset) rather than (offset + len) so a
     * huge `len` cannot wrap size_t arithmetic and bypass the clamp. */
    if (len == 0 || len > map->len - offset) {
        len = map->len - offset;
    }

    data = fmap_need_off_once_len((fmap_t *)map, offset, len, &len);
    if (!data) {
        cli_errmsg("cl_fmap_get_data: Failed to get data from fmap\n");
        status = CL_EREAD;
        goto done;
    }

    *data_out = data;
    if (data_len_out) {
        *data_len_out = len;
    }

    status = CL_SUCCESS;

done:
    return status;
}
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Release an fmap created with cl_fmap_open_handle() or
 *        cl_fmap_open_memory().
 *
 * Thin public wrapper around the internal fmap_free(). Any pointers
 * previously obtained from the map (data, name, path) are invalid after
 * this call.
 *
 * @param map The fmap to free.
 */
extern void cl_fmap_close(cl_fmap_t *map)
{
    fmap_free(map);
}
|