clamav/libclamav/regex_list.c

918 lines
29 KiB
C
Raw Normal View History

/*
* Match a string against a list of patterns/regexes.
*
* Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2007-2013 Sourcefire, Inc.
*
* Authors: Török Edvin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#ifdef CL_THREAD_SAFE
#ifndef _REENTRANT
#define _REENTRANT
#endif
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <zlib.h>
#include <limits.h>
#include <sys/types.h>
#include "regex/regex.h"
#include "clamav.h"
#include "others.h"
#include "regex_list.h"
#include "matcher-ac.h"
#include "matcher.h"
#include "str.h"
#include "readdb.h"
#include "jsparse/textbuf.h"
#include "regex_suffix.h"
#include "default.h"
#include "hashtab.h"
#include "mpool.h"
/* Prototypes */
/* Allocates a regex_t and records it in matcher->all_pregs; returns NULL if an allocation fails. */
static regex_t *new_preg(struct regex_matcher *matcher);
/* Reverses the NUL-terminated string in place and returns its length. */
static size_t reverse_string(char *pattern);
/* Callback passed to cli_regex2suffix(): files a copy of the regex entry under the given suffix. */
static cl_error_t add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *regex);
/* Registers a literal (non-regex) pattern; the pattern buffer is reversed in place. */
static cl_error_t add_static_pattern(struct regex_matcher *matcher, char *pattern);
/* ---------- */
/* Presumably result codes for a match attempt (0 = success, -1 = failure).
 * NOTE(review): neither macro is referenced in the part of the file visible here — confirm they are still needed. */
#define MATCH_SUCCESS 0
#define MATCH_FAILED -1
/*
 * Unrecoverable-error handler for list loading (used instead of exit()).
 *
 * Releases what the matcher currently owns and then flags it as failed
 * (list_inited == -1). The phishing module uses that flag to learn that a
 * list load was attempted and failed, so it can disable itself too.
 */
static void fatal_error(struct regex_matcher *matcher)
{
    /* Has to run before the flag changes: regex_list_done() only frees while list_inited == 1. */
    regex_list_done(matcher);

    matcher->list_inited = -1;
}
/*
 * Returns the character located just before position @pos.
 *
 * Without @info the lookup is done directly in @buffer: buffer[pos - 1]
 * (buffer[0] when pos is 0), or '\0' when pos lies beyond the string.
 *
 * With @info the lookup is done in info->pre_displayLink.data instead:
 * leading non-alphanumeric characters are skipped, then pos + info->host_start
 * characters are consumed without counting spaces, and the character before
 * the resulting position is returned ('\0' if the string ended first).
 */
static inline char get_char_at_pos_with_skip(const struct pre_fixup_info *info, const char *buffer, size_t pos)
{
    const char *str;
    size_t realpos = 0;

    if (!info) {
        return (pos <= strlen(buffer)) ? buffer[pos > 0 ? pos - 1 : 0] : '\0';
    }

    str = info->pre_displayLink.data;
    cli_dbgmsg("calc_pos_with_skip: skip:%llu, %llu - %llu \"%s\",\"%s\"\n", (long long unsigned)pos, (long long unsigned)info->host_start,
               (long long unsigned)info->host_end, str, buffer);
    pos += info->host_start;

    /* <ctype.h> functions require a value representable as unsigned char. */
    while (str[realpos] && !isalnum((unsigned char)str[realpos])) realpos++;

    for (; str[realpos] && (pos > 0); pos--) {
        while (str[realpos] == ' ') realpos++;
        /* Trailing spaces may have brought us to the terminator: stop here
         * instead of stepping past the end of the string. */
        if (!str[realpos])
            break;
        realpos++;
    }
    while (str[realpos] == ' ') realpos++;

    cli_dbgmsg("calc_pos_with_skip:%s\n", str + realpos);
    return (pos > 0 && !str[realpos]) ? '\0' : str[realpos > 0 ? realpos - 1 : 0];
}
/*
 * Decides whether a static (non-regex) pattern hit is a genuine host or
 * subdomain match rather than a coincidental substring match.
 *
 * The hit is accepted when the character following the matched text is one of
 * ' ', '\0', '/' or '?', and either the pattern covers the whole buffer or the
 * character in front of the matched part is '.' or ' '.
 *
 * On acceptance a '.' may be inserted into the URL held in @orig_real_url
 * (shifting its start left by one) when no '.' precedes the matched part.
 *
 * Returns 1 for an accepted match, 0 otherwise (including NULL @regex or pattern).
 */
static int validate_subdomain(const struct regex_list *regex, const struct pre_fixup_info *pre_fixup, const char *buffer, size_t buffer_len, char *real_url, size_t real_len, char *orig_real_url)
{
    char c;
    size_t match_len;
    int accepted = 0;

    if (!regex || !regex->pattern)
        return 0;

    match_len = strlen(regex->pattern);

    /* Character right after the matched text. */
    c = get_char_at_pos_with_skip(pre_fixup, buffer, buffer_len + 1);
    if (c == ' ' || c == '\0' || c == '/' || c == '?') {
        if (match_len == buffer_len) {
            /* full match */
            accepted = 1;
        } else if (match_len < buffer_len) {
            /* subdomain match: the matched part must start at a label boundary */
            c        = get_char_at_pos_with_skip(pre_fixup, buffer, buffer_len - match_len);
            accepted = (c == '.' || c == ' ');
        }
    }

    if (!accepted) {
        cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, regex->pattern, c);
        return 0;
    }

    /* we have an extra / at the end */
    if (match_len > 0)
        match_len--;

    cli_dbgmsg("Got a match: %s with %s\n", buffer, regex->pattern);
    cli_dbgmsg("Before inserting .: %s\n", orig_real_url);

    if (real_len < match_len + 1)
        return 1;

    if (real_url[real_len - match_len - 1] != '.') {
        /* Shift left and insert a '.'. get_host put a spare '.' at the very
         * beginning to make room, so the shift is done on orig_real_url and
         * overwrites that spare character. */
        const size_t orig_real_len = strlen(orig_real_url);
        const size_t dot_at        = orig_real_len - match_len - 1;

        cli_dbgmsg("No dot here:%s\n", real_url + (real_len - match_len - 1));
        memmove(orig_real_url, orig_real_url + 1, dot_at);
        orig_real_url[dot_at] = '.';
        cli_dbgmsg("After inserting .: %s\n", orig_real_url);
    }
    return 1;
}
/*
 * Looks up a real/displayed URL pair in a loaded and built regex list.
 *
 * @matcher              - matcher structure to use
 * @real_url             - href target; may be modified in place when a static
 *                         pattern matches (a '.' can be inserted)
 * @display_url          - <a> tag contents
 * @pre_fixup            - passed through to the static-pattern validation; may be NULL
 * @hostOnly             - if you want to match only the host part
 * @info                 - out: set to the pattern that matched, or NULL
 * @is_allow_list_lookup - is this a lookup in an allow list?
 *
 * @return - CL_SUCCESS   - url doesn't match (also when the list is not initialised)
 *         - CL_VIRUS     - url matches list
 *         - CL_ENULLARG  - a required pointer is NULL or the list was not built
 *         - CL_EMEM      - out of memory
 *
 * NOTE(review): a match is reported by storing the 0/1 result of the
 * validators in rc, which relies on CL_VIRUS being 1 — confirm.
 */
cl_error_t regex_list_match(struct regex_matcher *matcher, char *real_url, const char *display_url, const struct pre_fixup_info *pre_fixup, int hostOnly, const char **info, int is_allow_list_lookup)
{
    char *orig_real_url = real_url;
    struct regex_list *regex = NULL;
    size_t real_len, display_len, buffer_len;
    char *buffer  = NULL;
    char *bufrev  = NULL;
    cl_error_t rc = CL_SUCCESS;
    // int filter_search_rc = 0;
    int root;
    int mdata_inited = 0; /* set once cli_ac_initdata() succeeded, so cleanup knows to undo it */
    struct cli_ac_data mdata;
    struct cli_ac_result *res = NULL;

    if (NULL == matcher) {
        rc = CL_ENULLARG;
        cli_errmsg("regex_list_match: matcher must be initialized\n");
        goto done;
    }
    if (NULL == real_url) {
        rc = CL_ENULLARG;
        cli_errmsg("regex_list_match: real_url must be initialized\n");
        goto done;
    }
    if (NULL == display_url) {
        rc = CL_ENULLARG;
        cli_errmsg("regex_list_match: display_url must be initialized\n");
        goto done;
    }
    if (NULL == info) {
        rc = CL_ENULLARG;
        cli_errmsg("regex_list_match: info must be initialized\n");
        goto done;
    }

    *info = NULL;
    if (1 != matcher->list_inited) {
        rc = CL_SUCCESS;
        goto done;
    }
    if (0 == matcher->list_built) {
        cli_errmsg("regex_list_match: matcher->list_built must be initialized\n");
        rc = CL_ENULLARG;
        goto done;
    }

    /* skip initial '.' inserted by get_host */
    if (real_url[0] == '.') real_url++;
    if (display_url[0] == '.') display_url++;
    real_len    = strlen(real_url);
    display_len = strlen(display_url);
    buffer_len  = (hostOnly && !is_allow_list_lookup) ? real_len + 1 : real_len + display_len + 1 + 1;
    if (buffer_len < 3) {
        /* too short, no match possible */
        rc = CL_SUCCESS;
        goto done;
    }

    buffer = cli_malloc(buffer_len + 1);
    if (!buffer) {
        cli_errmsg("regex_list_match: Unable to allocate memory for buffer\n");
        rc = CL_EMEM;
        goto done;
    }

    /* Lengths are exact and the terminator is written explicitly below. */
    memcpy(buffer, real_url, real_len);
    buffer[real_len] = (!is_allow_list_lookup && hostOnly) ? '/' : ':';

    /*
     * For H-type PDB signatures, real_url is actually the DisplayedHostname.
     * RealHostname is not used.
     */
    if (!hostOnly || is_allow_list_lookup) {
        /* For all other PDB and WDB signatures concatenate Real:Displayed. */
        memcpy(buffer + real_len + 1, display_url, display_len);
    }
    buffer[buffer_len - 1] = '/';
    buffer[buffer_len]     = 0;
    cli_dbgmsg("Looking up in regex_list: %s\n", buffer);

    if (CL_SUCCESS != (rc = cli_ac_initdata(&mdata, 0, 0, 0, CLI_DEFAULT_AC_TRACKLEN)))
        goto done;
    mdata_inited = 1;

    /* Suffixes are stored reversed, so the trie is searched with the reversed buffer. */
    bufrev = cli_strdup(buffer);
    if (!bufrev) {
        rc = CL_EMEM;
        goto done;
    }
    reverse_string(bufrev);

    // TODO Add this back in once we improve the regex parsing code that finds
    // suffixes to add to the filter.
    //
    // Reviewing Coverity bug reports we found that the return value to this
    // filter_search call was effectively being ignored, causing no filtering
    // to occur. Fixing this issue resulted in a unit test that uses the
    // following match list regex to fail when searching for `ebay.com`.:
    //
    //   .+\\.paypal\\.(com|de|fr|it)([/?].*)?:.+\\.ebay\\.(at|be|ca|ch|co\\.uk|de|es|fr|ie|in|it|nl|ph|pl|com(\\.(au|cn|hk|my|sg))?)/
    //
    // After investigating further, this is because the regex_list_add_pattern
    // call, which parses the regex for suffixes and attempts to add these to
    // the filter, can't handle the `com(\\.(au|cn|hk|my|sg))?` portion of
    // the regex. As a result, it only adds `ebay.at`, `ebay.be`, `ebay.ca`, up
    // through `ebay.pl` into the filter). With the commented out code below
    // uncommented, these suffixes not existing in the filter are treated as
    // there not being a corresponding regex for ebay.com, causing no regex
    // rules to be evaluated against the URL.
    //
    // We should get the regex parsing code working (and ensure it handles any
    // other complex cases in daily.cdb) before re-enabling this code. The code
    // has had no effect for 12+ years at this point, though, so it's probably
    // safe to wait a bit longer without it.
    //
    // filter_search_rc = filter_search(&matcher->filter, (const unsigned char *)bufrev, buffer_len);
    // if (filter_search_rc == -1) {
    //     free(buffer);
    //     free(bufrev);
    //     /* filter says this suffix doesn't match.
    //      * The filter has false positives, but no false
    //      * negatives */
    //     return CL_SUCCESS;
    //}

    /* The return value is deliberately unused: candidates are collected in res. */
    (void)cli_ac_scanbuff((const unsigned char *)bufrev, buffer_len, NULL, (void *)&regex, &res, &matcher->suffixes, &mdata, 0, 0, NULL, AC_SCAN_VIR, NULL);

    rc   = CL_SUCCESS;
    root = matcher->root_regex_idx;
    /* Walk every suffix hit, then (once) the regexes filed under the root suffix. */
    while (res || root) {
        struct cli_ac_result *q;

        if (!res) {
            regex = matcher->suffix_regexes[root].head;
            root  = 0;
        } else {
            regex = res->customdata;
        }

        while (!rc && regex) {
            /* loop over multiple regexes corresponding to
             * this suffix */
            if (!regex->preg) {
                /* we matched a static pattern */
                rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url);
            } else {
                rc = !cli_regexec(regex->preg, buffer, 0, NULL, 0);
            }
            if (rc) *info = regex->pattern;
            regex = regex->nxt;
        }

        /* Result nodes are always released, even after a match was found. */
        if (res) {
            q   = res;
            res = res->next;
            free(q);
        }
    }

    if (!rc)
        cli_dbgmsg("Lookup result: not in regex list\n");
    else
        cli_dbgmsg("Lookup result: in regex list\n");

done:
    /* Single cleanup point so no error path leaks the buffers or the AC scan data. */
    if (mdata_inited)
        cli_ac_freedata(&mdata);
    free(bufrev);
    free(buffer);
    return rc;
}
/* Initialization & loading */
/*
 * Initializes @matcher, allocating necessary substructures.
 *
 * The structure is zeroed first (the mempool pointer is preserved when built
 * with USE_MPOOL), then the suffix hash table, the suffix AC matcher, the two
 * BM matchers and the filter are set up.
 *
 * matcher->list_inited is set to 1 only after every step succeeded, so a
 * caller that inspects the flag (as load_regex_matcher() does) can detect a
 * failed initialisation. On failure the pieces that were already set up are
 * released again and list_inited stays 0.
 *
 * Returns CL_SUCCESS, CL_ENULLARG for a NULL matcher (or NULL mempool with
 * USE_MPOOL), or the error reported by cli_ac_init()/cli_bm_init().
 */
cl_error_t init_regex_list(struct regex_matcher *matcher, uint8_t dconf_prefiltering)
{
#ifdef USE_MPOOL
    mpool_t *mp = NULL;
#endif
    cl_error_t rc = CL_SUCCESS;

    if (NULL == matcher) {
        cli_errmsg("init_regex_list: matcher must be initialized\n");
        rc = CL_ENULLARG;
        goto done;
    }

#ifdef USE_MPOOL
    /* Remember the pool: the memset below wipes it. */
    mp = matcher->mempool;
    if (NULL == mp) {
        cli_errmsg("init_regex_list: matcher->mempool must be initialized\n");
        rc = CL_ENULLARG;
        goto done;
    }
#endif

    /* Leaves list_inited, list_built and list_loaded at 0. */
    memset(matcher, 0, sizeof(*matcher));

    cli_hashtab_init(&matcher->suffix_hash, 512);

#ifdef USE_MPOOL
    matcher->mempool          = mp;
    matcher->suffixes.mempool = mp;
#endif
    if ((rc = cli_ac_init(&matcher->suffixes, 2, 32, dconf_prefiltering))) {
        goto undo_hash;
    }

#ifdef USE_MPOOL
    matcher->sha256_hashes.mempool  = mp;
    matcher->hostkey_prefix.mempool = mp;
#endif
    if ((rc = cli_bm_init(&matcher->sha256_hashes))) {
        goto undo_ac;
    }
    if ((rc = cli_bm_init(&matcher->hostkey_prefix))) {
        goto undo_bm;
    }

    filter_init(&matcher->filter);

    /* Everything is in place: only now advertise the matcher as initialised. */
    matcher->list_inited = 1;
    goto done;

undo_bm:
    cli_bm_free(&matcher->sha256_hashes);
undo_ac:
    cli_ac_free(&matcher->suffixes);
undo_hash:
    cli_hashtab_free(&matcher->suffix_hash);
done:
    return rc;
}
/*
 * Checks the optional ":min-max" functionality-level suffix of a list line.
 *
 * The text after the last ':' is treated as a functionality level only if it
 * has the form "<digits>-<digits>" (either side may be empty); anything else
 * leaves the line untouched and is accepted.
 *
 * @line is modified in place: the '-' is overwritten once the suffix has been
 * recognised, and on acceptance the line is cut at the last ':' so the suffix
 * is removed.
 *
 * Returns CL_SUCCESS when the line should be loaded, CL_EMALFDB when the
 * current functionality level (cl_retflevel()) is outside [min, max].
 */
static int functionality_level_check(char *line)
{
    char *ptmin;
    char *ptmax;
    size_t j;
    size_t max_digits;
    size_t min, max;

    ptmin = strrchr(line, ':');
    if (!ptmin)
        return CL_SUCCESS;
    ptmin++;

    ptmax = strchr(ptmin, '-');
    if (!ptmax)
        return CL_SUCCESS; /* there is no functionality level specified, so we're ok */
    ptmax++;

    /* <ctype.h> functions require a value representable as unsigned char. */
    for (j = 0; j + ptmin + 1 < ptmax; j++)
        if (!isdigit((unsigned char)ptmin[j]))
            return CL_SUCCESS; /* not numbers, not functionality level */

    max_digits = strlen(ptmax);
    for (j = 0; j < max_digits; j++)
        if (!isdigit((unsigned char)ptmax[j]))
            return CL_SUCCESS; /* see above */

    /* Split "min-max" into two strings. */
    ptmax[-1] = '\0';

    /* strtoul saturates on overflow, whereas atoi() has undefined behaviour. */
    min = strtoul(ptmin, NULL, 10);
    if (max_digits == 0)
        max = INT_MAX; /* open-ended range */
    else
        max = strtoul(ptmax, NULL, 10);

    if (min > cl_retflevel()) {
        cli_dbgmsg("regex list line %s not loaded (required f-level: %u)\n", line, (unsigned int)min);
        return CL_EMALFDB;
    }
    if (max < cl_retflevel())
        return CL_EMALFDB;

    /* Strip the ":min-max" suffix from the line. */
    ptmin[-1] = '\0';
    return CL_SUCCESS;
}
/*
 * Adds one hash entry (given as a hex string) to the matcher.
 *
 * @matcher   - matcher that receives the entry
 * @pattern   - hex-encoded hash; decoded with CLI_MPOOL_HEX2STR
 * @fl        - flag character stored as the entry's one-byte "virname"
 * @is_prefix - non-zero: 4-byte entry for matcher->hostkey_prefix,
 *              zero: 32-byte entry for matcher->sha256_hashes
 *
 * A 32-byte entry whose flag is not 'W' is skipped (CL_SUCCESS) when the same
 * hash is already present with a 'W' flag.
 *
 * Every allocation comes from the matcher's pool, so every release goes
 * through MPOOL_FREE, and all failure paths release what was allocated.
 *
 * NOTE(review): the decoded length is not checked against the 4/32 bytes
 * that are read afterwards — confirm callers only pass full-length hashes.
 *
 * Returns CL_SUCCESS, CL_EMEM, CL_EMALFDB or the cli_hashset_init() error.
 */
static int add_hash(struct regex_matcher *matcher, char *pattern, const char fl, int is_prefix)
{
    int rc;
    struct cli_bm_patt *pat;
    struct cli_matcher *bm;
    const char *vname = NULL;

    pat = MPOOL_CALLOC(matcher->mempool, 1, sizeof(*pat));
    if (!pat)
        return CL_EMEM;

    pat->pattern = (unsigned char *)CLI_MPOOL_HEX2STR(matcher->mempool, pattern);
    if (!pat->pattern) {
        rc = CL_EMALFDB;
        goto release;
    }

    pat->length = 32;
    if (is_prefix) {
        pat->length = 4;
        bm          = &matcher->hostkey_prefix;
    } else {
        bm = &matcher->sha256_hashes;
    }

    /* The prefix set is created lazily, on the first hash that is added. */
    if (!matcher->sha256_pfx_set.keys) {
        if ((rc = cli_hashset_init(&matcher->sha256_pfx_set, 1048576, 90))) {
            goto release;
        }
    }

    /* Cheap prefix-set test first, full BM lookup only on a possible hit. */
    if (fl != 'W' && pat->length == 32 &&
        cli_hashset_contains(&matcher->sha256_pfx_set, cli_readint32(pat->pattern)) &&
        cli_bm_scanbuff(pat->pattern, 32, &vname, NULL, &matcher->sha256_hashes, 0, NULL, NULL, NULL) == CL_VIRUS) {
        if (*vname == 'W') {
            /* hash is allowed in local.gdb */
            cli_dbgmsg("Skipping hash %s\n", pattern);
            rc = CL_SUCCESS;
            goto release;
        }
    }

    pat->virname = MPOOL_MALLOC(matcher->mempool, 1);
    if (!pat->virname) {
        cli_errmsg("add_hash: Unable to allocate memory for path->virname\n");
        rc = CL_EMEM;
        goto release;
    }
    *pat->virname = fl;

    cli_hashset_addkey(&matcher->sha256_pfx_set, cli_readint32(pat->pattern));

    if ((rc = cli_bm_addpatt(bm, pat, "*"))) {
        cli_errmsg("add_hash: failed to add BM pattern\n");
        rc = CL_EMALFDB;
        goto release;
    }

    /* Success: the BM matcher keeps the pattern. */
    return CL_SUCCESS;

release:
    if (pat->virname)
        MPOOL_FREE(matcher->mempool, pat->virname);
    if (pat->pattern)
        MPOOL_FREE(matcher->mempool, pat->pattern);
    MPOOL_FREE(matcher->mempool, pat);
    return rc;
}
/* Load patterns/regexes from file */
cl_error_t load_regex_matcher(struct cl_engine *engine, struct regex_matcher *matcher, FILE *fd, unsigned int *signo, unsigned int options, int is_allow_list_lookup, struct cli_dbio *dbio, uint8_t dconf_prefiltering)
{
cl_error_t rc;
int line = 0, entry = 0;
char buffer[FILEBUFF];
if (NULL == matcher) {
cli_errmsg("load_regex_matcher: matcher must be initialized\n");
return CL_ENULLARG;
}
if (matcher->list_inited == -1)
return CL_EMALFDB; /* already failed to load */
if (!fd && !dbio) {
cli_errmsg("Unable to load regex list (null file)\n");
return CL_ENULLARG;
}
cli_dbgmsg("Loading regex_list\n");
if (!matcher->list_inited) {
rc = init_regex_list(matcher, dconf_prefiltering);
if (!matcher->list_inited) {
cli_errmsg("Regex list failed to initialize!\n");
fatal_error(matcher);
return rc;
}
}
/*
* Regexlist db format, common to .wdb (allow list) and .pdb (domain list) files.
*
* Multiple lines of form, (empty lines are skipped):
* Flags RealURL DisplayedURL
* Where:
* Flags:
*
* .pdb files:
* R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing
* flags that should be filtered.
* [i.e. phishcheck urls.flags that we don't want to be done for this particular host]
*
* .wdb files:
* X - full URL regex
* Y - host-only regex
* M - host simple pattern
*
* If a line in the file doesn't conform to this format, loading fails
*
*/
while (cli_dbgets(buffer, FILEBUFF, fd, dbio)) {
char *pattern;
char *flags;
size_t pattern_len;
cli_chomp(buffer);
line++;
if (!*buffer)
continue; /* skip empty lines */
Fix several coverity warnings 290424 Missing break in switch - In hash_match: Missing break statement between cases in switch statement 290414 Resource leak - In cli_scanishield_msi: Leak of memory or pointers to system resources. Memory leak in a fail case 288197 Resource leak - In decrypt_any: Leak of memory or pointers to system resources. Memory leak in a fail case 290426 Resource leak - In cli_magic_scan: Leak of memory or pointers to system resources. Leaked a file prefix when running with --save-temps 192923 Resource leak - In cli_scanrar: Leak of memory or pointers to system resources. Leaked a file descriptor if a virus was found in a RAR file comment 225146 Resource leak - In cli_scanegg: Leak of memory or pointers to system resources. Leaked a file descriptor if unable to write a comment file to disk 290425 Resource leak - In scan_common: Leak of memory or pointers to system resources. Memory leaks in various fail cases. Also changes cli_scanrar to write out the file comment only if --leave-temps is specified and scan the buffer (like what is done in cli_scanegg) instead of writing the file out, scanning that, and then deleting the file if --leave-temps is not specified. The unit tests stopped working when correcting an issue with a switch statement that determined what type of signature had matched on a Google SafeBrowsing GDB rule. Looking into the unit tests, it looks like the code had always assumed that the test cases would be detected by a malware test rule in unit_tests/input/daily.gdb, but now some of the tests get matched on the phishing test rule. I updated the test logic to be more clear, and added tests for both cases now. Fix some memory leaks in libclamav/scanners.c
2020-07-15 08:39:32 -07:00
if (buffer[0] == '#')
continue;
if (functionality_level_check(buffer))
continue;
if (engine->cb_sigload && engine->cb_sigload("phishing", buffer, ~options & CL_DB_OFFICIAL, engine->cb_sigload_ctx)) {
cli_dbgmsg("load_regex_matcher: skipping %s due to callback\n", buffer);
continue;
}
entry++;
pattern = strchr(buffer, ':');
if (!pattern) {
cli_errmsg("Malformed regex list line %d\n", line);
fatal_error(matcher);
return CL_EMALFDB;
}
/*pattern[0]='\0';*/
flags = buffer + 1;
pattern++;
pattern_len = strlen(pattern);
/* '-3' to leave room for the '/' and null being
* appended below.
*/
if ((pattern - buffer) + pattern_len < (FILEBUFF - 3)) {
pattern[pattern_len] = '/';
pattern[pattern_len + 1] = '\0';
} else {
cli_errmsg("Overlong regex line %d\n", line);
fatal_error(matcher);
return CL_EMALFDB;
}
if ((buffer[0] == 'R' && !is_allow_list_lookup) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_allow_list_lookup)) {
/* regex for hostname*/
if ((rc = regex_list_add_pattern(matcher, pattern))) {
return rc == CL_EMEM ? CL_EMEM : CL_EMALFDB;
}
} else if ((buffer[0] == 'H' && !is_allow_list_lookup) || (buffer[0] == 'M' && is_allow_list_lookup)) {
/*matches displayed host*/
if ((rc = add_static_pattern(matcher, pattern)))
return rc == CL_EMEM ? CL_EMEM : CL_EMALFDB;
} else if (buffer[0] == 'S' && (!is_allow_list_lookup || pattern[0] == 'W')) {
pattern[pattern_len] = '\0';
if (pattern[0] == 'W')
flags[0] = 'W';
if ((pattern[0] == 'W' || pattern[0] == 'F' || pattern[0] == 'P') && pattern[1] == ':') {
pattern += 2;
if ((rc = add_hash(matcher, pattern, flags[0], pattern[-2] == 'P'))) {
cli_errmsg("Error loading at line: %d\n", line);
return rc == CL_EMEM ? CL_EMEM : CL_EMALFDB;
}
} else {
cli_errmsg("Error loading line: %d, %c\n", line, *pattern);
return CL_EMALFDB;
}
} else {
return CL_EMALFDB;
}
}
matcher->list_loaded = 1;
if (signo)
*signo += entry;
return CL_SUCCESS;
}
/*
 * Build the matcher list.
 *
 * Finalises a loaded list: the loading-only lookup structures (suffix hash
 * table and SHA256 prefix set) are released and the suffix trie is built.
 * The prefix set is released whether or not the trie build succeeds, so a
 * failed build does not leave it allocated.
 *
 * Returns CL_SUCCESS (also for a NULL matcher), -1 when the list was not
 * initialised/loaded, or the error reported by cli_ac_buildtrie().
 */
cl_error_t cli_build_regex_list(struct regex_matcher *matcher)
{
    cl_error_t rc;

    if (!matcher)
        return CL_SUCCESS;

    if (!matcher->list_inited || !matcher->list_loaded) {
        cli_errmsg("Regex list not loaded!\n");
        return -1; /*TODO: better error code */
    }

    cli_dbgmsg("Building regex list\n");
    cli_hashtab_free(&matcher->suffix_hash);

    rc = cli_ac_buildtrie(&matcher->suffixes);

    /* Only needed while hashes are being added; drop it on success and failure alike. */
    cli_hashset_destroy(&matcher->sha256_pfx_set);

    if (rc)
        return rc;

    matcher->list_built = 1;
    return CL_SUCCESS;
}
/*
 * Done with this matcher, free resources.
 *
 * Does nothing unless matcher->list_inited == 1. Releases the suffix trie,
 * every per-suffix regex list entry (and its pattern copy), every compiled
 * regex, the suffix hash table and both BM matchers.
 *
 * Pointers and counters of the released arrays are reset, so the structure
 * holds no dangling references afterwards. list_inited itself is left
 * unchanged; callers such as fatal_error() update it.
 */
void regex_list_done(struct regex_matcher *matcher)
{
    size_t i;

    if (NULL == matcher) {
        cli_errmsg("regex_list_done: matcher must be initialized\n");
        return;
    }

    if (matcher->list_inited != 1)
        return;

    cli_ac_free(&matcher->suffixes);

    if (matcher->suffix_regexes) {
        for (i = 0; i < matcher->suffix_cnt; i++) {
            struct regex_list *r = matcher->suffix_regexes[i].head;
            while (r) {
                struct regex_list *q = r;
                r                    = r->nxt;
                free(q->pattern);
                free(q);
            }
        }
        free(matcher->suffix_regexes);
        matcher->suffix_regexes = NULL;
    }
    /* Keep the counters consistent with the (now empty) array. */
    matcher->suffix_cnt     = 0;
    matcher->root_regex_idx = 0;

    if (matcher->all_pregs) {
        for (i = 0; i < matcher->regex_cnt; i++) {
            regex_t *r = matcher->all_pregs[i];
            cli_regfree(r);
            MPOOL_FREE(matcher->mempool, r);
        }
        MPOOL_FREE(matcher->mempool, matcher->all_pregs);
        matcher->all_pregs = NULL;
    }
    matcher->regex_cnt = 0;

    cli_hashtab_free(&matcher->suffix_hash);
    cli_bm_free(&matcher->sha256_hashes);
    cli_bm_free(&matcher->hostkey_prefix);
}
/*
 * Tells whether the matcher is in a usable state.
 *
 * Returns 1 when there is either no regex list at all or it was initialised
 * successfully, i.e. list_inited is anything but -1; returns 0 when a load
 * failed (list_inited == -1) or @matcher is NULL.
 */
int is_regex_ok(struct regex_matcher *matcher)
{
    if (NULL == matcher) {
        cli_errmsg("is_regex_ok: matcher must be initialized\n");
        return 0;
    }

    /* Only the "load failed" marker makes the matcher not OK. */
    return matcher->list_inited != -1;
}
/*
 * Registers a new suffix with the suffix AC matcher and with the filter.
 *
 * @matcher - owner of the AC matcher (matcher->suffixes) and the filter
 * @info    - regex list entry attached to the AC pattern as customdata
 * @suffix  - suffix bytes (need not be NUL-terminated within @len)
 * @len     - number of bytes of @suffix to use
 *
 * Once cli_ac_addpatt() has accepted the pattern it belongs to the AC
 * matcher (it is released by cli_ac_free() in regex_list_done()), so it is
 * not freed here even if the subsequent filter registration fails.
 *
 * Returns CL_SUCCESS, CL_ENULLARG, CL_EMEM, the cli_ac_addpatt() error, or
 * CL_ERROR when the filter rejects the suffix.
 */
static cl_error_t add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, const char *suffix, size_t len)
{
    struct cli_matcher *root = NULL;
    struct cli_ac_patt *new  = NULL;
    size_t i;
    int owned_by_root = 0; /* set once the AC matcher has taken the pattern */
    cl_error_t ret    = CL_SUCCESS;

    if (NULL == matcher) {
        cli_errmsg("add_newsuffix: matcher must be initialized\n");
        ret = CL_ENULLARG;
        goto done;
    }
    if (NULL == suffix) {
        cli_errmsg("add_newsuffix: suffix must be initialized\n");
        ret = CL_ENULLARG;
        goto done;
    }

    /* Address of an embedded member: cannot be NULL once matcher is non-NULL. */
    root = &matcher->suffixes;

    new = MPOOL_CALLOC(matcher->mempool, 1, sizeof(*new));
    if (!new) {
        cli_errmsg("add_newsuffix: Unable to allocate memory for new\n");
        ret = CL_EMEM;
        goto done;
    }

    new->rtype      = 0;
    new->type       = 0;
    new->sigid      = 0;
    new->parts      = 0;
    new->partno     = 0;
    new->mindist    = 0;
    new->maxdist    = 0;
    new->offset_min = CLI_OFF_ANY;
    new->length[0]  = (uint16_t)len;

    new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE;
    if (new->length[0] > root->maxpatlen)
        root->maxpatlen = new->length[0];

    new->pattern = MPOOL_MALLOC(matcher->mempool, sizeof(new->pattern[0]) * len);
    if (!new->pattern) {
        cli_errmsg("add_newsuffix: Unable to allocate memory for new->pattern\n");
        ret = CL_EMEM;
        goto done;
    }
    for (i = 0; i < len; i++) {
        new->pattern[i] = suffix[i]; /*new->pattern is short int* */
    }

    new->customdata = info;
    new->virname    = NULL;

    if ((ret = cli_ac_addpatt(root, new))) {
        goto done;
    }
    owned_by_root = 1;

    if (filter_add_static(&matcher->filter, (const unsigned char *)suffix, len, "regex") < 0) {
        cli_errmsg("add_newsuffix: Unable to add filter\n");
        ret = CL_ERROR;
        goto done;
    }

done:
    /* Free only what is still ours; a pattern already in the trie must stay alive. */
    if (CL_SUCCESS != ret && !owned_by_root && NULL != new) {
        if (NULL != new->pattern) {
            MPOOL_FREE(matcher->mempool, new->pattern);
        }
        MPOOL_FREE(matcher->mempool, new);
    }
    return ret;
}
/* Presumably the log-message prefix for this module.
 * NOTE(review): not referenced in the part of the file visible here — confirm it is still used. */
#define MODULE "regex_list: "
/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */
static void list_add_tail(struct regex_list_ht *ht, struct regex_list *regex)
{
if (!ht->head)
ht->head = regex;
if (ht->tail) {
ht->tail->nxt = regex;
}
ht->tail = regex;
}
static cl_error_t add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_len, const struct regex_list *iregex)
{
struct regex_matcher *matcher = cbdata;
struct regex_list *regex = NULL;
const struct cli_element *el = NULL;
cl_error_t ret = CL_SUCCESS;
if (NULL == matcher) {
cli_errmsg("add_pattern_suffix: matcher must be initialized\n");
ret = CL_ENULLARG;
goto done;
}
if (NULL == suffix) {
cli_errmsg("add_pattern_suffix: suffix must be initialized\n");
ret = CL_ENULLARG;
goto done;
}
if (NULL == iregex) {
cli_errmsg("add_pattern_suffix: iregex must be initialized\n");
ret = CL_ENULLARG;
goto done;
}
CLI_MALLOC(regex, sizeof(*regex),
cli_errmsg("add_pattern_suffix: Unable to allocate memory for regex\n");
ret = CL_EMEM);
if (NULL == iregex->pattern) {
regex->pattern = NULL;
} else {
CLI_STRDUP(iregex->pattern, regex->pattern,
cli_errmsg("add_pattern_suffix: unable to strdup iregex->pattern");
ret = CL_EMEM);
}
regex->preg = iregex->preg;
regex->nxt = NULL;
el = cli_hashtab_find(&matcher->suffix_hash, suffix, suffix_len);
/* TODO: what if suffixes are prefixes of eachother and only one will
* match? */
if (el) {
/* existing suffix */
if ((size_t)el->data >= matcher->suffix_cnt) {
cli_errmsg("add_pattern_suffix: el-> data too large");
ret = CL_ERROR;
goto done;
}
CMake: Add CTest support to match Autotools checks An ENABLE_TESTS CMake option is provided so that users can disable testing if they don't want it. Instructions for how to use this included in the INSTALL.cmake.md file. If you run `ctest`, each testcase will write out a log file to the <build>/unit_tests directory. As with Autotools' make check, the test files are from test/.split and unit_tests/.split files, but for CMake these are generated at build time instead of at test time. On Posix systems, sets the LD_LIBRARY_PATH so that ClamAV-compiled libraries can be loaded when running tests. On Windows systems, CTest will identify and collect all library dependencies and assemble a temporarily install under the build/unit_tests directory so that the libraries can be loaded when running tests. The same feature is used on Windows when using CMake to install to collect all DLL dependencies so that users don't have to install them manually afterwards. Each of the CTest tests are run using a custom wrapper around Python's unittest framework, which is also responsible for finding and inserting valgrind into the valgrind tests on Posix systems. Unlike with Autotools, the CMake CTest Valgrind-tests are enabled by default, if Valgrind can be found. There's no need to set VG=1. CTest's memcheck module is NOT supported, because we use Python to orchestrate our tests. Added a bunch of Windows compatibility changes to the unit tests. These were primarily changing / to PATHSEP and making adjustments to use Win32 C headers and ifdef out the POSIX ones which aren't available on Windows. Also disabled a bunch of tests on Win32 that don't work on Windows, notably the mmap ones and FD-passing (i.e. FILEDES) ones. Add JSON_C_HAVE_INTTYPES_H definition to clamav-config.h to eliminate warnings on Windows where json.h is included after inttypes.h because json-c's inttypes replacement relies on it. 
This is a it of a hack and may be removed if json-c fixes their inttypes header stuff in the future. Add preprocessor definitions on Windows to disable MSVC warnings about CRT secure and nonstandard functions. While there may be a better solution, this is needed to be able to see other more serious warnings. Add missing file comment block and copyright statement for clamsubmit.c. Also change json-c/json.h include filename to json.h in clamsubmit.c. The directory name is not required. Changed the hash table data integer type from long, which is poorly defined, to size_t -- which is capable of storing a pointer. Fixed a bunch of casts regarding this variable to eliminate warnings. Fixed two bugs causing utf8 encoding unit tests to fail on Windows: - The in_size variable should be the number of bytes, not the character count. This was was causing the SHIFT_JIS (japanese codepage) to UTF8 transcoding test to only transcode half the bytes. - It turns out that the MultiByteToWideChar() API can't transcode UTF16-BE to UTF16-LE. The solution is to just iterate over the buffer and flip the bytes on each uint16_t. This but was causing the UTF16-BE to UTF8 tests to fail. I also split up the utf8 transcoding tests into separate tests so I could see all of the failures instead of just the first one. Added a flags parameter to the unit test function to open testfiles because it turns out that on Windows if a file contains the \r\n it will replace it with just \n if you opened the file as a text file instead of as binary. However, if we open the CBC files as binary, then a bunch of bytecode tests fail. So I've changed the tests to open the CBC files in the bytecode tests as text files and open all other files as binary. Ported the feature tests from shell scripts to Python using a modified version of our QA test-framework, which is largely compatible and will allow us to migrate some QA tests into this repo. 
I'd like to add GitHub Actions pipelines in the future so that all public PR's get some testing before anyone has to manually review them. The clamd --log option was missing from the help string, though it definitely works. I've added it in this commit. It appears that clamd.c was never clang-format'd, so this commit also reformats clamd.c. Some of the check_clamd tests expected the path returned by clamd to match character for character with original path sent to clamd. However, as we now evaluate real paths before a scan, the path returned by clamd isn't going to match the relative (and possibly symlink-ridden) path passed to clamdscan. I fixed this test by changing the test to search for the basename: <signature> FOUND within the response instead of matching the exact path. Autotools: Link check_clamd with libclamav so we can use our utility functions in check_clamd.c.
2020-08-25 23:14:23 -07:00
list_add_tail(&matcher->suffix_regexes[(size_t)el->data], regex);
} else {
/* new suffix */
size_t n = matcher->suffix_cnt;
el = cli_hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, (cli_element_data)n);
CLI_REALLOC(matcher->suffix_regexes,
(n + 1) * sizeof(*matcher->suffix_regexes),
cli_errmsg("add_pattern_suffix: Unable to reallocate memory for matcher->suffix_regexes\n");
ret = CL_EMEM);
matcher->suffix_regexes[n].tail = regex;
matcher->suffix_regexes[n].head = regex;
if (suffix[0] == '/' && suffix[1] == '\0') {
matcher->root_regex_idx = n;
}
ret = add_newsuffix(matcher, regex, suffix, suffix_len);
if (CL_SUCCESS != ret) {
cli_hashtab_delete(&matcher->suffix_hash, suffix, suffix_len);
/*shrink the size back to what it was.*/
CLI_REALLOC(matcher->suffix_regexes, n * sizeof(*matcher->suffix_regexes));
} else {
matcher->suffix_cnt++;
}
}
done:
if (CL_SUCCESS != ret) {
FREE(regex->pattern);
FREE(regex);
}
return ret;
}
/*
 * Reverses the NUL-terminated string @pattern in place.
 * Returns the length of the string (terminator not counted).
 */
static size_t reverse_string(char *pattern)
{
    const size_t len = strlen(pattern);

    if (len > 1) {
        /* Swap from both ends towards the middle. */
        char *lo = pattern;
        char *hi = pattern + len - 1;

        while (lo < hi) {
            const char tmp = *lo;
            *lo++          = *hi;
            *hi--          = tmp;
        }
    }
    return len;
}
/*
 * Allocates a new regex_t and appends it to matcher->all_pregs.
 *
 * matcher->regex_cnt is increased only after both allocations succeeded, so
 * all_pregs[0 .. regex_cnt-1] always holds valid pointers (regex_list_done()
 * walks exactly that range). The array pointer is replaced only when the
 * reallocation succeeded.
 *
 * Returns the new regex_t (uninitialised memory), or NULL on allocation failure.
 */
static regex_t *new_preg(struct regex_matcher *matcher)
{
    regex_t *r;
    void *grown;

    /* NOTE(review): assumes MPOOL_REALLOC, like realloc(), leaves the old block valid on failure — confirm. */
    grown = MPOOL_REALLOC(matcher->mempool, matcher->all_pregs, (matcher->regex_cnt + 1) * sizeof(*matcher->all_pregs));
    if (!grown) {
        cli_errmsg("new_preg: Unable to reallocate memory\n");
        return NULL;
    }
    matcher->all_pregs = grown;

    r = MPOOL_MALLOC(matcher->mempool, sizeof(*r));
    if (!r) {
        /* The array keeps one spare slot; the count is unchanged, so it is never read. */
        cli_errmsg("new_preg: Unable to allocate memory\n");
        return NULL;
    }

    matcher->all_pregs[matcher->regex_cnt] = r;
    matcher->regex_cnt++;
    return r;
}
static cl_error_t add_static_pattern(struct regex_matcher *matcher, char *pattern)
{
size_t len;
struct regex_list regex;
cl_error_t rc = CL_EMEM;
len = reverse_string(pattern);
regex.nxt = NULL;
CLI_STRDUP(pattern, regex.pattern,
cli_errmsg("add_static_pattern: Cannot allocate memory for regex.pattern\n");
rc = CL_EMEM);
regex.preg = NULL;
rc = add_pattern_suffix(matcher, pattern, len, &regex);
done:
FREE(regex.pattern);
return rc;
}
/*
 * If the first @len bytes of @pattern end with @tail, replaces that ending
 * by a single '/'. @tail_size is sizeof() of the tail literal, i.e. it
 * includes the terminator. Returns the resulting length (no NUL is written).
 */
static size_t trim_regex_tail(char *pattern, size_t len, const char *tail, size_t tail_size)
{
    const size_t tail_len = tail_size - 1;

    if (len > tail_size && strncmp(&pattern[len - tail_size + 1], tail, tail_len) == 0) {
        len -= tail_len;
        pattern[len++] = '/';
    }
    return len;
}

/*
 * Adds a regex pattern to the matcher.
 *
 * Only the host is matched, so a trailing path part ("([/?].*)?/" and then
 * "([/?].*)/") is cut from @pattern in place first. A regex_t is then
 * obtained from new_preg() and the pattern is handed to cli_regex2suffix(),
 * which reports its suffixes through add_pattern_suffix().
 *
 * Returns CL_EMEM when no regex_t could be allocated, otherwise the
 * cli_regex2suffix() result; on failure the regex is released with cli_regfree().
 */
cl_error_t regex_list_add_pattern(struct regex_matcher *matcher, char *pattern)
{
    /* we only match the host, so remove useless stuff */
    static const char optional_path[] = "([/?].*)?/";
    static const char required_path[] = "([/?].*)/";
    cl_error_t rc;
    regex_t *preg;
    size_t len = strlen(pattern);

    len          = trim_regex_tail(pattern, len, optional_path, sizeof(optional_path));
    len          = trim_regex_tail(pattern, len, required_path, sizeof(required_path));
    pattern[len] = '\0';

    preg = new_preg(matcher);
    if (!preg)
        return CL_EMEM;

    rc = cli_regex2suffix(pattern, preg, add_pattern_suffix, (void *)matcher);
    if (rc) {
        cli_regfree(preg);
    }
    return rc;
}