clamav/libclamav/matcher.h

/*
 * Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 * Copyright (C) 2007-2013 Sourcefire, Inc.
 *
 * Authors: Tomasz Kojm
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */
#ifndef __MATCHER_H
#define __MATCHER_H
#include <sys/types.h>
#include "clamav.h"
#include "filetypes.h"
#include "others.h"
#include "execs.h"
struct cli_target_info {
    off_t fsize;
    struct cli_exe_info exeinfo;
    int status; /* 0 == not initialised, 1 == initialised OK, -1 == error */
};
/**
 * Initialize a struct cli_target_info so that it's ready to have its exeinfo
 * populated by the call to cli_targetinfo and/or destroyed by
 * cli_targetinfo_destroy.
 *
 * @param info a pointer to the struct cli_target_info to initialize
 */
void cli_targetinfo_init(struct cli_target_info *info);
/**
 * Free resources associated with a struct cli_target_info initialized
 * via cli_targetinfo_init.
 *
 * @param info a pointer to the struct cli_target_info to destroy
 */
void cli_targetinfo_destroy(struct cli_target_info *info);
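/*
 * Illustrative lifecycle sketch (a hedged example, not part of this header's
 * API surface). The populator cli_targetinfo() lives in matcher.c; its exact
 * argument list is assumed here only in outline.
 *
 *     struct cli_target_info info;
 *
 *     cli_targetinfo_init(&info);
 *     // ... populate via cli_targetinfo(&info, <target type>, <scan context>) ...
 *     if (1 == info.status) {
 *         // exeinfo was parsed successfully and may be consulted (e.g. for
 *         // EP- or section-relative offset resolution)
 *     }
 *     cli_targetinfo_destroy(&info); // intended to be safe after init, per the comments above
 */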
#include "matcher-ac.h"
#include "matcher-bm.h"
#include "matcher-hash.h"
#include "matcher-pcre.h"
#include "matcher-byte-comp.h"
#include "regex_pcre.h"
#include "fmap.h"
#include "mpool.h"
// clang-format off
#define CLI_MATCH_METADATA 0xff00
#define CLI_MATCH_WILDCARD 0x0f00
#define CLI_MATCH_CHAR 0x0000
#define CLI_MATCH_NOCASE 0x1000
#define CLI_MATCH_IGNORE 0x0100
#define CLI_MATCH_SPECIAL 0x0200
#define CLI_MATCH_NIBBLE_HIGH 0x0300
#define CLI_MATCH_NIBBLE_LOW 0x0400
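/*
 * Rough sketch of how the matchers consume these flags (an editorial
 * illustration based on the AC/BM pattern layout, not a normative
 * description): each pattern element is a 16-bit value whose low byte is the
 * literal character and whose high bits carry one of the annotations above.
 * CLI_MATCH_SPECIAL is assumed to mark special/alternative subpatterns and
 * CLI_MATCH_NOCASE, which sits outside the wildcard mask, to request
 * case-insensitive matching of the literal byte.
 *
 *     uint16_t p = pattern[i];
 *     switch (p & CLI_MATCH_WILDCARD) {
 *         case CLI_MATCH_CHAR:        // plain byte match against (p & 0xff)
 *             break;
 *         case CLI_MATCH_IGNORE:      // "??" wildcard: any byte matches
 *             break;
 *         case CLI_MATCH_NIBBLE_HIGH: // only the high nibble is significant
 *             break;
 *         case CLI_MATCH_NIBBLE_LOW:  // only the low nibble is significant
 *             break;
 *     }
 */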
typedef enum tdb_type {
    CLI_TDB_UINT,
    CLI_TDB_RANGE,
    CLI_TDB_STR,
    CLI_TDB_RANGE2,
    CLI_TDB_FTYPE,
    CLI_TDB_FTYPE_EXPR
} tdb_type_t;
struct cli_lsig_tdb {
    uint32_t *val, *range;
    char *str;
    tdb_type_t cnt[3];
    uint32_t subsigs;
    const uint32_t *target;
    const uint32_t *engine, *nos, *ep, *filesize;
    const uint32_t *container, *handlertype;
    const uint32_t *intermediates;
    /*
    const uint32_t *sectoff, *sectrva, *sectvsz, *sectraw, *sectrsz,
                   *secturva, *sectuvsz, *secturaw, *sectursz;
    */
    const char *icongrp1, *icongrp2;
    uint32_t *macro_ptids;
#ifdef USE_MPOOL
    mpool_t *mempool;
#else
    void *_padding_mempool;
#endif
};
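/*
 * For orientation, a hedged example of how a logical-signature Target
 * Description Block (TDB) maps onto the fields above. Given a TDB such as
 * "Engine:51-255,Target:1,Container:CL_TYPE_ZIP" (attribute names per the
 * signature documentation; treat the exact mapping as an assumption):
 *
 *     "Engine:51-255"         -> tdb.engine points at the parsed range {51, 255}
 *     "Target:1"              -> tdb.target points at the parsed value {1}
 *     "Container:CL_TYPE_ZIP" -> tdb.container points at the parsed CL_TYPE value
 *
 * Unused attributes remain NULL; the val/range/str buffers back the parsed
 * numeric, range, and string values.
 */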
// clang-format on
#define CLI_LSIG_FLAG_PRIVATE 0x01
typedef enum lsig_type {
    CLI_LSIG_NORMAL,
    CLI_YARA_NORMAL,
    CLI_YARA_OFFSET
} lsig_type_t;
struct cli_bc;
struct cli_ac_lsig {
    uint32_t id;
    unsigned bc_idx;
    lsig_type_t type;
    uint8_t flag;
    union {
        char *logic;
        uint8_t *code_start;
    } u;
    char *virname;
    struct cli_lsig_tdb tdb;
};
typedef void *fuzzyhashmap_t;
struct cli_matcher {
    unsigned int type;

    /* Extended Boyer-Moore */
    uint8_t *bm_shift;
    struct cli_bm_patt **bm_suffix, **bm_pattab;
    uint32_t *soff, soff_len; /* for PE section sigs */
    uint32_t bm_offmode, bm_patterns, bm_reloff_num, bm_absoff_num;

    /* HASH */
    struct cli_hash_patt hm;
    struct cli_hash_wild hwild;

    /* Extended Aho-Corasick */
    uint32_t ac_partsigs, ac_nodes, ac_lists, ac_patterns, ac_lsigs;
    struct cli_ac_lsig **ac_lsigtable;
    struct cli_ac_node *ac_root, **ac_nodetable;
    struct cli_ac_list **ac_listtable;
    struct cli_ac_patt **ac_pattable;
    struct cli_ac_patt **ac_reloff;
    uint32_t ac_reloff_num, ac_absoff_num;
    uint8_t ac_mindepth, ac_maxdepth;
    struct filter *filter;
    uint16_t maxpatlen;
    uint8_t ac_only;

    /* Perl Compatible Regular Expressions (PCRE) */
#if HAVE_PCRE
    uint32_t pcre_metas;
    struct cli_pcre_meta **pcre_metatable;
    uint32_t pcre_reloff_num, pcre_absoff_num;
#endif

    /* Byte Compare */
    uint32_t bcomp_metas;
    struct cli_bcomp_meta **bcomp_metatable;

    /* Fuzzy Image Hash */
    fuzzyhashmap_t fuzzy_hashmap;

    /* Bytecode Tracker */
    uint32_t linked_bcs;

    /* Store pointers to malloc'd trans values so that they can be more easily freed */
    struct cli_ac_node ***trans_array;
    size_t trans_cnt;
    size_t trans_capacity;

#ifdef USE_MPOOL
    mpool_t *mempool;
#else
    void *_padding_mempool;
#endif
};
struct cli_cdb {
    char *virname;           /* virus name */
    cli_file_t ctype;        /* container type */
    regex_t name;            /* filename regex */
    size_t csize[2];         /* container size (min, max); if csize[0] != csize[1]
                              * then a value of 0 makes the field ignored
                              */
    size_t fsizec[2];        /* file size in container */
    size_t fsizer[2];        /* real file size */
    int encrypted;           /* file is encrypted; 2 == ignore */
    unsigned int filepos[2]; /* file position in container */
    int res1;                /* reserved / format specific */
    void *res2;              /* reserved / format specific */
    struct cli_cdb *next;
};
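/*
 * Hedged illustration of the container-metadata (.cdb) signature fields that
 * this struct holds, using the documented colon-separated format (field order
 * assumed from the signature documentation):
 *
 *     VirusName:ContainerType:ContainerSize:FileNameREGEX:FileSizeInContainer:FileSizeReal:IsEncrypted:FilePos:Res1:Res2
 *
 * For example, a rule alerting on any ".exe" entry inside a ZIP of any size
 * would set virname, ctype == CL_TYPE_ZIP, and a compiled `name` regex, while
 * leaving the size, encryption, and position fields as wildcards.
 */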
#define CLI_MAX_TARGETS 10 /* maximum filetypes for a specific target */
struct cli_mtarget {
    cli_file_t target[CLI_MAX_TARGETS];
    const char *name;
    uint8_t idx; /* idx of matcher */
    uint8_t ac_only;
    uint8_t enable_prefiltering;
    uint8_t target_count; /* must be synced with non-zero values in the target array */
};
#define CLI_MTARGETS 15
static const struct cli_mtarget cli_mtargets[CLI_MTARGETS] = {
    /* All types for target, name, idx, ac_only, pre-filtering?, # of types */
    {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "GENERIC", 0, 0, 1, 1},
    {{CL_TYPE_MSEXE, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "PE", 1, 0, 1, 1},
    {{CL_TYPE_MSOLE2, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "OLE2", 2, 1, 0, 1},
    {{CL_TYPE_HTML, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "HTML", 3, 1, 0, 1},
    {{CL_TYPE_MAIL, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "MAIL", 4, 1, 1, 1},
    {{CL_TYPE_GRAPHICS, CL_TYPE_GIF, CL_TYPE_PNG, CL_TYPE_JPEG, CL_TYPE_TIFF, 0, 0, 0, 0, 0}, "GRAPHICS", 5, 1, 0, 5},
    {{CL_TYPE_ELF, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "ELF", 6, 1, 0, 1},
    {{CL_TYPE_TEXT_ASCII, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "ASCII", 7, 1, 1, 1},
    {{CL_TYPE_ERROR, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "NOT USED", 8, 1, 0, 1},
    {{CL_TYPE_MACHO, CL_TYPE_MACHO_UNIBIN, 0, 0, 0, 0, 0, 0, 0, 0}, "MACH-O", 9, 1, 0, 2},
    {{CL_TYPE_PDF, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "PDF", 10, 1, 0, 1},
    {{CL_TYPE_SWF, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "FLASH", 11, 1, 0, 1},
    {{CL_TYPE_JAVA, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "JAVA", 12, 1, 0, 1},
    {{CL_TYPE_INTERNAL, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "INTERNAL", 13, 1, 0, 1},
    {{CL_TYPE_OTHER, 0, 0, 0, 0, 0, 0, 0, 0, 0}, "OTHER", 14, 1, 0, 1}};
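/*
 * Hedged illustration of how a signature's Target value is resolved through
 * this table: Target:5 selects the "GRAPHICS" matcher (idx 5), whose target[]
 * list covers CL_TYPE_GRAPHICS, GIF, PNG, JPEG, and TIFF (target_count == 5).
 * A minimal lookup by detected file type might look like the following sketch
 * (an assumption for illustration, not the library's actual helper):
 *
 *     for (i = 0; i < CLI_MTARGETS; i++)
 *         for (j = 0; j < cli_mtargets[i].target_count; j++)
 *             if (cli_mtargets[i].target[j] == ftype)
 *                 return i; // index of the matcher trie to use
 */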
// clang-format off
#define CLI_OFF_ANY 0xffffffff
#define CLI_OFF_NONE 0xfffffffe
#define CLI_OFF_ABSOLUTE 1
#define CLI_OFF_EOF_MINUS 2
#define CLI_OFF_EP_PLUS 3
#define CLI_OFF_EP_MINUS 4
#define CLI_OFF_SL_PLUS 5
#define CLI_OFF_SX_PLUS 6
#define CLI_OFF_VERSION 7
#define CLI_OFF_MACRO 8
#define CLI_OFF_SE 9
// clang-format on
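/*
 * Hedged reference for how body-signature offset strings map onto the
 * CLI_OFF_* types above (offset syntax per the signature documentation; the
 * exact pairing is an editorial assumption, not normative):
 *
 *     "n"     -> CLI_OFF_ABSOLUTE  (absolute file offset)
 *     "EOF-n" -> CLI_OFF_EOF_MINUS (relative to the end of file)
 *     "EP+n"  -> CLI_OFF_EP_PLUS   (relative to the entry point)
 *     "EP-n"  -> CLI_OFF_EP_MINUS
 *     "SL+n"  -> CLI_OFF_SL_PLUS   (relative to the last section)
 *     "Sx+n"  -> CLI_OFF_SX_PLUS   (relative to section x)
 *     "SEx"   -> CLI_OFF_SE        (anywhere within section x)
 *     "VI"    -> CLI_OFF_VERSION   (PE version information)
 *     macro   -> CLI_OFF_MACRO     (macro subsignature offsets)
 *
 * "*" (any offset) and "no offset" correspond to CLI_OFF_ANY / CLI_OFF_NONE.
 */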
/**
 * @brief Non-magic scan matching using a file buffer for input. Older API.
 *
 * This function is lower-level, requiring a call to `cli_exp_eval()` after the
 * match to evaluate logical signatures and yara rules.
 *
 * This function does not perform file type magic identification and does not use
 * the file format scanners.
 *
 * @param buffer The buffer to be matched.
 * @param length The length of the buffer, i.e. the number of bytes to match.
 * @param offset Offset into the buffer from which to start matching.
 * @param ctx The scanning context.
 * @param ftype If specified, may limit signature matching to the trie for the target type corresponding to the given CL_TYPE.
 * @param[in,out] acdata A list of pattern-matching data structs to hold the match results, one for each pattern-matching trie.
 * @return cl_error_t
 */
cl_error_t cli_scan_buff(const unsigned char *buffer, uint32_t length, uint32_t offset, cli_ctx *ctx, cli_file_t ftype, struct cli_ac_data **acdata);
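/*
 * Hedged usage sketch: a parser handing a decoded buffer to the matcher. The
 * CL_TYPE_ANY file type and the NULL acdata argument (letting the matcher
 * manage its own AC data) mirror common call sites, but treat both as
 * assumptions rather than a documented contract.
 *
 *     cl_error_t ret = cli_scan_buff(decoded, decoded_len, 0, ctx, CL_TYPE_ANY, NULL);
 *     if (CL_VIRUS == ret) {
 *         // a signature matched within the decoded buffer
 *     }
 */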
/**
* @brief Non-magic scan matching using a file descriptor for input.
*
* This function does not perform file type magic identification and does not use
* the file format scanners.
*
* This function uses the newer cli_scan_fmap() scanning API.
*
* @param desc File descriptor to be used for input
* @param ctx The scanning context.
 * @param ftype If specified, limits signature matching to the trie for the target type that corresponds to the given CL_TYPE.
* @param ftonly Boolean indicating if the scan is for file-type detection only.
* @param[out] ftoffset A list of file type signature matches with their corresponding offsets.
* @param acmode Use AC_SCAN_VIR and AC_SCAN_FT to set scanning modes.
* @param[out] acres A list of cli_ac_result AC pattern matching results.
* @param name (optional) Original name of the file (to set fmap name metadata)
* @param attributes Layer attributes for the thing to be scanned.
* @return cl_error_t
*/
cl_error_t cli_scan_desc(int desc, cli_ctx *ctx, cli_file_t ftype, uint8_t ftonly, struct cli_matched_type **ftoffset,
unsigned int acmode, struct cli_ac_result **acres, const char *name, uint32_t attributes);
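
/*
 * Illustrative sketch, not taken from the ClamAV sources: a minimal cli_scan_desc()
 * call that performs a plain signature-matching pass over an already-open file
 * descriptor. The variables `fd` and `ctx` are assumed to exist and be valid,
 * CL_TYPE_ANY is assumed to mean "no target-type restriction", NULL is assumed to
 * be acceptable for the ftoffset/acres outputs when they are not needed, and 0 is
 * passed for the layer attributes (no special attributes assumed).
 *
 *     cl_error_t ret = cli_scan_desc(fd, ctx, CL_TYPE_ANY, 0, NULL,
 *                                    AC_SCAN_VIR, NULL, "example.bin", 0);
 *     if (CL_VIRUS == ret) {
 *         // a signature matched; the detection was recorded in ctx
 *     }
 */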
/**
* @brief Non-magic scan matching of the current fmap in the scan context. Newer API.
*
* This API will invoke cli_exp_eval() for you.
*
* @param ctx The scanning context.
 * @param ftype If specified, limits signature matching to the trie for the target type that corresponds to the given CL_TYPE.
* @param ftonly Boolean indicating if the scan is for file-type detection only.
* @param[out] ftoffset A list of file type signature matches with their corresponding offsets.
* @param acmode Use AC_SCAN_VIR and AC_SCAN_FT to set scanning modes.
* @param[out] acres A list of cli_ac_result AC pattern matching results.
* @param refhash MD5 hash of the current file, used to save time creating hashes and to limit scan recursion for the HandlerType logical signature FTM feature.
* @return cl_error_t
*/
cl_error_t cli_scan_fmap(cli_ctx *ctx, cli_file_t ftype, uint8_t ftonly, struct cli_matched_type **ftoffset, unsigned int acmode, struct cli_ac_result **acres, unsigned char *refhash);
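
/*
 * Illustrative sketch (assumptions noted inline): a typical cli_scan_fmap() call as a
 * file-format parser might make it after the fmap to scan has become the current one
 * in the scan context. File-type offsets, raw AC results, and a reference hash are
 * not needed here, so NULL is assumed to be acceptable for those parameters.
 *
 *     cl_error_t ret = cli_scan_fmap(ctx, CL_TYPE_ANY, 0, NULL, AC_SCAN_VIR, NULL, NULL);
 *     if (CL_VIRUS == ret) {
 *         // a signature matched on the current fmap
 *     }
 */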
/**
* @brief Evaluate logical signatures and yara rules given the AC matching results
* from cli_scan_buff() / matcher_run().
*
* @param ctx The scanning context.
* @param root The AC trie root to match with.
* @param acdata AC match results for a specific AC trie.
* @param target_info File metadata used to evaluate logical sig and yara rule options.
* @param hash Reference hash of the current file, used to limit recursion for the HandlerType logical signature FTM feature.
* @return cl_error_t
*/
cl_error_t cli_exp_eval(cli_ctx *ctx, struct cli_matcher *root, struct cli_ac_data *acdata, struct cli_target_info *target_info, const char *hash);
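
/*
 * Illustrative sketch only: evaluating logical signatures and yara rules once the raw
 * AC matching pass has completed. It assumes `root` (struct cli_matcher *), `acdata`
 * (struct cli_ac_data *), `info` (struct cli_target_info *), and `hash` (const char *,
 * the file's reference hash) were all prepared by the caller, as cli_scan_fmap() does
 * internally; none of that setup is shown here.
 *
 *     cl_error_t ret = cli_exp_eval(ctx, root, acdata, info, hash);
 *     if (CL_VIRUS == ret) {
 *         // a logical signature or yara rule evaluated to true
 *     }
 */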
cl_error_t cli_caloff(const char *offstr, const struct cli_target_info *info, unsigned int target, uint32_t *offdata, uint32_t *offset_min, uint32_t *offset_max);
/**
 * @brief Determine if an alert is a known false positive by checking the MD5, SHA1, and SHA256 hashes of each fmap in the ctx->container stack against the allow lists.
*
* @param ctx The scanning context.
* @param vname (Optional) The name of the signature alert.
 * @return cl_error_t CL_CLEAN if an allow-list hash matches one of the fmap hashes in the scan recursion stack.
 *                    CL_VIRUS if no allow-list hash matches.
*/
cl_error_t cli_check_fp(cli_ctx *ctx, const char *vname);
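
/*
 * Illustrative sketch: how a match routine might consult cli_check_fp() before
 * reporting a detection, so that allow-listed files (or parent layers) suppress the
 * alert. "Example.Signature-1" is a placeholder signature name, not a real one.
 *
 *     if (CL_CLEAN == cli_check_fp(ctx, "Example.Signature-1")) {
 *         // an allow-list hash matched somewhere in the recursion stack: drop the alert
 *     } else {
 *         // CL_VIRUS: no allow-list match, the detection stands
 *     }
 */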
cl_error_t cli_matchmeta(cli_ctx *ctx, const char *fname, size_t fsizec, size_t fsizer, int encrypted, unsigned int filepos, int res1, void *res2);
/** Parse the executable headers and, if successful, populate exeinfo
*
* If target refers to a supported executable file type, the exe header
* will be parsed and, if successful, info->status will be set to 1.
* If parsing the exe header fails, info->status will be set to -1.
* The caller MUST destroy info via a call to cli_targetinfo_destroy
* regardless of what info->status is set to.
*
 * @param info A structure to populate with info from the exe header. This
 *             MUST be initialized via cli_targetinfo_init prior to calling
 *             this function.
 * @param target The target executable file type. Possible values are:
* - 1 - PE32 / PE32+
* - 6 - ELF
* - 9 - MachO
* @param ctx The current scan context
*/
void cli_targetinfo(struct cli_target_info *info, unsigned int target, cli_ctx *ctx);
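
/*
 * Illustrative sketch of the init / parse / destroy lifecycle described above,
 * assuming `ctx` is a valid scan context whose current fmap is a PE file (target
 * type 1) and that cli_targetinfo_init()/cli_targetinfo_destroy() take a pointer to
 * the struct, as their use here implies.
 *
 *     struct cli_target_info info;
 *
 *     cli_targetinfo_init(&info);
 *     cli_targetinfo(&info, 1, ctx);
 *     if (1 == info.status) {
 *         // exe header parsed successfully; info.exeinfo is populated
 *     }
 *     cli_targetinfo_destroy(&info); // required regardless of info.status
 */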
#endif