clamav/libclamav/jsparse/js-norm.c

/*
* Javascript normalizer.
*
* Copyright (C) 2013-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2008-2013 Sourcefire, Inc.
*
* Authors: Török Edvin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include "clamav.h"
#include "jsparse/lexglobal.h"
#include "hashtab.h"
#include "others.h"
#include "str.h"
#include "js-norm.h"
#include "jsparse/generated/operators.h"
#include "jsparse/generated/keywords.h"
#include "jsparse/textbuf.h"
/* ----------- tokenizer ---------------- */
enum tokenizer_state {
Initial,
MultilineComment,
SinglelineComment,
Number,
DoubleQString,
SingleQString,
Identifier,
Dummy
};
typedef struct scanner {
struct text_buffer buf;
const char *yytext;
size_t yylen;
const char *in;
size_t insize;
size_t pos;
size_t lastpos;
enum tokenizer_state state;
enum tokenizer_state last_state;
} *yyscan_t;
static int yylex(YYSTYPE *lvalp, yyscan_t);
static void yy_scan_bytes(const char *, size_t, yyscan_t scanner);
static const char *yyget_text(yyscan_t scanner);
static int yyget_leng(yyscan_t scanner);
static int yylex_init(yyscan_t *ptr_yy_globals);
static int yylex_destroy(yyscan_t yyscanner);
/* ----------- tokenizer end ---------------- */
enum fsm_state {
Base,
InsideVar,
InsideInitializer,
WaitFunctionName,
WaitParameterList,
InsideFunctionDecl
};
struct scope {
struct cli_hashtable id_map;
struct scope *parent; /* hierarchy */
struct scope *nxt; /* all scopes kept in a list so we can easily free all of them */
enum fsm_state fsm_state;
int last_token;
unsigned int brackets;
unsigned int blocks;
};
struct tokens {
yystype *data;
size_t cnt;
size_t capacity;
};
/* state for the current JS file being parsed */
struct parser_state {
unsigned long var_uniq;
unsigned long syntax_errors;
struct scope *global;
struct scope *current;
struct scope *list;
yyscan_t scanner;
struct tokens tokens;
unsigned int rec;
};
static struct scope *scope_new(struct parser_state *state)
{
struct scope *parent = state->current;
struct scope *s = calloc(1, sizeof(*s));
if (!s)
return NULL;
if (cli_hashtab_init(&s->id_map, 10) < 0) {
free(s);
return NULL;
}
s->parent = parent;
s->fsm_state = Base;
s->nxt = state->list;
state->list = s;
state->current = s;
return s;
}
static struct scope *scope_done(struct scope *s)
{
struct scope *parent = s->parent;
/* TODO: have a hashtab_destroy */
cli_hashtab_clear(&s->id_map);
free(s->id_map.htable);
free(s);
return parent;
}
/* transitions:
* Base --(VAR)--> InsideVar
* InsideVar --(Identifier)-->InsideInitializer
* InsideVar --(anything_else) --> POP (to Base)
* InsideInitializer --(COMMA)--> POP (to InsideVar)
* InsideInitializer | InsideVar --(SEMICOLON) --> POP (to Base)
* InsideInitializer --(BRACKET_OPEN) --> WaitBrClose
* InsideInitializer --(PAR_OPEN) --> WaitParClose
* WaitBrClose --(BRACKET_OPEN) --> increase depth
* WaitBrClose --(BRACKET_CLOSE) --> POP
* WaitParClose --(PAR_CLOSE) --> POP
* WaitParClose --(PAR_OPEN) --> increase depth
*/
/* Base --(VAR)--> PUSH, to InsideVar
* InsideVar --(Identifier)--> InsideInitializer
* InsideVar --(ELSE)--> POP, inc. syntax_errors
* InsideInitializer --(COMMA)--> POP (to InsideVar)
* --(BRACKET_OPEN)--> inc bracket_counter
* --(PAR_OPEN)--> inc par_counter
* --(BRACKET_CLOSE) --> dec bracket_counter
* --(PAR_CLOSE)--> dec par_counter
* --(VAR)--> PUSH, to InsideVar (if bracket_counter != 0 || par_counter != 0)
* --> POP, to InsideVar, inc. syntax_errors (if bracket_counter == 0 && par_counter == 0)
* POP only allowed if bracket_counter == 0 && par_counter == 0
*
* InsideInitializer acts differently, make it only a flag
* ....................
*
* Pushing, Popping is done when entering / exiting function scopes,
* tracking { and function ( is done by the function scope tracker too.
*
* we only need to track brackets.
*/
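/* Illustrative trace of the scheme above on `var x = f(a), y;`
 * (token names abbreviated; assumes we start in Base with brackets == 0):
 *   var  -> InsideVar
 *   x    -> InsideInitializer, x declared in the current scope
 *   (    -> brackets++
 *   a    -> used (not declared)
 *   )    -> brackets--
 *   ,    -> back to InsideVar (pop allowed since brackets == 0)
 *   y    -> InsideInitializer, y declared
 *   ;    -> back to Base
 */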
/*
* var x = document;
* x.writeln(...);
*
* ^we must not normalize member method names
*/
/*
* Variables are declared at function scope, and their initial value is
* undefined. From the point of the initializer onward, the value is
* defined.
*
* { doesn't introduce a new variable scope, they are in function's scope too
*
* function foo() {
* alert(x); -> x exists, undefined
* var x=5;
* alert(x); -> x exists, =5
* }
*
* vs.
*
* function bar() {
* alert(x);//error, x not declared
* x=5;
* }
*
* vs.
*
* But we can also create variables without var; this is only valid if we
* use them after assigning.
*
* function foobar() {
* x=5;
* alert(x);//x is defined, value is 5
* }
*
* other examples:
* function foo2() {
* alert(x); -> x exists, undefined
* {
* var x=5; -> x equals to 5
* }
* alert(x); -> x is 5
* }
*
* function foo3() {
* var x=4; -> x exists, equals to 4
* alert(x); -> x exists, equals to 4
* {
* var x=5; -> x equals to 5
* }
* alert(x); -> x is 5
* }
*
* function bar3() {
* //same as foo3
* var x=4;
* alert(x);
* {
* x=5;
* }
* alert(x);
* }
*
*/
static const char *scope_declare(struct scope *s, const char *token, const size_t len, struct parser_state *state)
{
const struct cli_element *el = cli_hashtab_insert(&s->id_map, token, len, (const cli_element_data)(state->var_uniq++));
/* cli_hashtab_insert either finds an already existing entry or allocates a
* new one; we return the stored key */
return el ? el->key : NULL;
}
static const char *scope_use(struct scope *s, const char *token, const size_t len)
{
const struct cli_element *el = cli_hashtab_find(&s->id_map, token, len);
if (el) {
/* identifier already found in current scope,
* return here to avoid overwriting uniq id */
return el->key;
}
/* identifier not yet in current scope's hashtab, add with ID -1.
* Later if we find a declaration it will automatically assign a uniq ID
* to it. If not, we'll know that we have to push ID == -1 tokens to an
* outer scope.*/
el = cli_hashtab_insert(&s->id_map, token, len, (const cli_element_data)-1);
return el ? el->key : NULL;
}
static size_t scope_lookup(struct scope *s, const char *token, const size_t len)
{
while (s) {
const struct cli_element *el = cli_hashtab_find(&s->id_map, token, len);
if (el && (size_t)el->data != (size_t)-1) {
return (size_t)el->data;
}
/* not found in current scope, try in outer scope */
s = s->parent;
}
return (size_t)-1;
}
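/* How the three scope helpers interact (illustrative):
 *   scope_use("x")     -> "x" inserted into this scope's map with data == -1
 *                         (seen, but not declared here yet)
 *   scope_declare("x") -> the same entry now receives a uniq id taken from
 *                         state->var_uniq
 *   scope_lookup("x")  -> searches this scope, then its parents; returns the
 *                         uniq id, or (size_t)-1 if "x" was never declared
 */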
static cl_error_t tokens_ensure_capacity(struct tokens *tokens, size_t cap)
{
if (tokens->capacity < cap) {
yystype *data;
cap += 1024;
/* Keep old data if OOM */
data = cli_max_realloc(tokens->data, cap * sizeof(*tokens->data));
if (!data)
return CL_EMEM;
tokens->data = data;
tokens->capacity = cap;
}
return CL_SUCCESS;
}
static int add_token(struct parser_state *state, const yystype *token)
{
if (tokens_ensure_capacity(&state->tokens, state->tokens.cnt + 1))
return -1;
state->tokens.data[state->tokens.cnt++] = *token;
return 0;
}
struct buf {
size_t pos;
int outfd;
char buf[65536];
};
static inline cl_error_t buf_outc(char c, struct buf *buf)
{
if (buf->pos >= sizeof(buf->buf)) {
if (write(buf->outfd, buf->buf, sizeof(buf->buf)) != sizeof(buf->buf))
return CL_EWRITE;
buf->pos = 0;
}
buf->buf[buf->pos++] = c;
return CL_SUCCESS;
}
static inline cl_error_t buf_outs(const char *s, struct buf *buf)
{
const size_t buf_len = sizeof(buf->buf);
size_t i;
i = buf->pos;
while (*s) {
while (i < buf_len && *s) {
if (isspace(*s & 0xff))
buf->buf[i++] = ' ';
else
buf->buf[i++] = tolower((unsigned char)(*s));
++s;
}
if (i == buf_len) {
if (write(buf->outfd, buf->buf, buf_len) < 0)
return CL_EWRITE;
i = 0;
}
}
buf->pos = i;
return CL_SUCCESS;
}
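/* Note: buf_outs writes each whitespace byte as a single ' ' and lowercases
 * every other byte, e.g. "Foo\tBar" is emitted as "foo bar". */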
static inline void output_space(char last, char current, struct buf *out)
{
if (isalnum(last) && isalnum(current))
buf_outc(' ', out);
}
/* return class of last character */
static char output_token(const yystype *token, struct scope *scope, struct buf *out, char lastchar)
{
char sbuf[128];
const char *s = TOKEN_GET(token, cstring);
/* TODO: use a local buffer, instead of FILE* */
switch (token->type) {
case TOK_StringLiteral:
output_space(lastchar, '"', out);
buf_outc('"', out);
if (s) {
buf_outs(s, out);
}
buf_outc('"', out);
return '\"';
case TOK_NumericInt:
output_space(lastchar, '0', out);
snprintf(sbuf, sizeof(sbuf), "%ld", TOKEN_GET(token, ival));
buf_outs(sbuf, out);
return '0';
case TOK_NumericFloat:
output_space(lastchar, '0', out);
snprintf(sbuf, sizeof(sbuf), "%g", TOKEN_GET(token, dval));
buf_outs(sbuf, out);
return '0';
case TOK_IDENTIFIER_NAME:
output_space(lastchar, 'a', out);
if (s) {
size_t id = scope_lookup(scope, s, strlen(s));
if (id == (size_t)-1) {
/* identifier not normalized */
buf_outs(s, out);
} else {
snprintf(sbuf, sizeof(sbuf), "n%03zu", id);
buf_outs(sbuf, out);
}
}
return 'a';
case TOK_FUNCTION:
output_space(lastchar, 'a', out);
buf_outs("function", out);
return 'a';
default:
if (s) {
const size_t len = strlen(s);
output_space(lastchar, s[0], out);
buf_outs(s, out);
return len ? s[len - 1] : '\0';
}
return '\0';
}
}
/*
* We can't delete the scope as soon as we see a }, because
* we still need the hashmap from it.
*
* If we normalized all identifiers and emitted output as each scope closed,
* it would be impossible to normalize calls to other functions.
*
* So we need to keep all scopes in memory; to do this, instead of scope_done we
* simply set current = current->parent when a scope is closed.
* We keep a list of all scopes created in parser_state->list. When we have
* parsed everything, we output everything, and then delete all scopes.
*
* We also need to know where to switch scopes on the second pass, so for
* TOK_FUNCTION types we will use another pointer that points to the scope
* (added to yystype's union).
*
* We look up the identifier in the scope (scope_lookup searches parent
* scopes too); if an ID is found we output ("n%03d", id),
* otherwise we output the identifier as is.
*
* To make it easier to match signatures, we do a transform:
* 'function ID1 (...' => 'n%03d = function (...'
*/
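/* End-to-end example of the transform (illustrative; actual counter values
 * depend on state->var_uniq at declaration time):
 *   input:  function foo(bar) { return bar; }
 *   output: n000=function(n001){return n001}
 * (output_space() only inserts a blank between two alphanumeric characters,
 * so most separators need no space)
 */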
/*
* We add all identifiers to the scope's map;
* those that are not declarations will have the initial ID -1.
* If we later see a declaration for one in the same scope, it automatically
* gets a correct ID.
*
* When parsing of a local scope is done, we take any ID -1 identifiers
* and push them up one level (careful not to overwrite existing IDs).
*
* It would be nice if the tokens contained a link to the entry in the
* hashtab, a link that automatically gets updated when the element is moved
* (pushed up). This would prevent subsequent lookups in the map
* when we want to output the tokens.
* There is no easy way to do that, so we just do another lookup.
*
*/
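/* Illustrative example of the push-up:
 *   function f() { g(); }    g used here -> ID -1 in f's scope
 *   function g() { }         g declared later, at global scope
 * When f's scope is done, the ID -1 entry for g is pushed up into the
 * global scope, where g's declaration then assigns the real uniq ID, so
 * the call site and the declaration normalize to the same name.
 */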
/*
* This actually works, redefining foo:
* function foo() {
* var foo=5; alert(foo);
* }
* So can we treat function names just like any other identifier?
* We can, because you can no longer call foo once you've redefined it as a var.
* So if we rename both foo-s to the same name, the behaviour is preserved.
*
* This means that a new scope should begin after function, and not after
* function ... (.
*/
static void scope_free_all(struct scope *p)
{
struct scope *nxt;
do {
nxt = p->nxt;
scope_done(p);
p = nxt;
} while (p);
}
size_t cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens);
static int match_parameters(const yystype *tokens, size_t num_tokens, const char **param_names, size_t num_param_names)
{
size_t token_idx = 1;
size_t names_idx = 0;
if (tokens[0].type != TOK_PAR_OPEN) {
return -1;
}
if (token_idx >= num_tokens) {
return -1;
}
while (names_idx < num_param_names) {
const char *token_val;
/* bounds-check and type-check the token before reading its string value */
if ((token_idx >= num_tokens) ||
(tokens[token_idx].type != TOK_IDENTIFIER_NAME)) {
return -1;
}
token_val = TOKEN_GET(&tokens[token_idx], cstring);
if (token_val == NULL) {
return -1;
}
token_idx++;
if (0 != strcmp(token_val, param_names[names_idx])) {
return -1;
}
names_idx++;
/* parameters are separated by commas; the last one is followed by ')' */
if ((token_idx >= num_tokens) ||
(names_idx < num_param_names && tokens[token_idx].type != TOK_COMMA) ||
(names_idx == num_param_names && tokens[token_idx].type != TOK_PAR_CLOSE)) {
return -1;
}
token_idx++;
}
return 0;
}
static const char *de_packer_3[] = {"p", "a", "c", "k", "e", "r"};
static const char *de_packer_2[] = {"p", "a", "c", "k", "e", "d"};
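/* These spell out the parameter lists of the common "packer" eval stubs,
 * i.e. (illustrative):
 *   eval(function(p,a,c,k,e,r){...})  matched via de_packer_3
 *   eval(function(p,a,c,k,e,d){...})  matched via de_packer_2
 */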
static inline char *textbuffer_done(yyscan_t scanner)
{
char *str = cli_max_realloc(scanner->buf.data, scanner->buf.pos);
if (!str) {
str = scanner->buf.data;
}
scanner->yytext = str;
scanner->yylen = scanner->buf.pos - 1;
memset(&scanner->buf, 0, sizeof(scanner->buf));
return str;
}
#define MODULE "JS-Norm: "
static void free_token(yystype *token)
{
if (token->vtype == vtype_string) {
if (NULL != token->val.string) {
free(token->val.string);
token->val.string = NULL;
}
}
}
static cl_error_t replace_token_range(struct tokens *dst, size_t start, size_t end, const struct tokens *with)
{
const size_t len = with ? with->cnt : 0;
size_t i;
cli_dbgmsg(MODULE "Replacing tokens %lu - %lu with %lu tokens\n", (unsigned long)start,
(unsigned long)end, (unsigned long)len);
if (start >= dst->cnt || end > dst->cnt)
return CL_EARG;
for (i = start; i < end; i++) {
free_token(&dst->data[i]);
}
if (tokens_ensure_capacity(dst, dst->cnt - (end - start) + len))
return CL_EMEM;
memmove(&dst->data[start + len], &dst->data[end], (dst->cnt - end) * sizeof(dst->data[0]));
if (with && len > 0) {
memcpy(&dst->data[start], with->data, len * sizeof(dst->data[0]));
}
dst->cnt = dst->cnt - (end - start) + len;
return CL_SUCCESS;
}
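/* e.g. replace_token_range(dst, 2, 5, with) frees dst->data[2..4], moves the
 * tail into place, and splices with->cnt tokens in at index 2; passing
 * with == NULL simply deletes the range. */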
static cl_error_t append_tokens(struct tokens *dst, const struct tokens *src)
{
if (!dst || !src)
return CL_ENULLARG;
if (tokens_ensure_capacity(dst, dst->cnt + src->cnt))
return CL_EMEM;
cli_dbgmsg(MODULE "Appending %lu tokens\n", (unsigned long)(src->cnt));
memcpy(&dst->data[dst->cnt], src->data, src->cnt * sizeof(dst->data[0]));
dst->cnt += src->cnt;
return CL_SUCCESS;
}
static void decode_de(yystype *params[], struct text_buffer *txtbuf)
{
const char *p = TOKEN_GET(params[0], cstring);
const long a = TOKEN_GET(params[1], ival);
/*const char *c = params[2];*/
char *k = TOKEN_GET(params[3], string);
/*const char *r = params[5];*/
unsigned val = 0;
unsigned nsplit = 0;
const char *o;
const char **tokens;
if (!p || !k)
return;
for (o = k; *o; o++)
if (*o == '|') nsplit++;
nsplit++;
tokens = malloc(sizeof(char *) * nsplit);
if (!tokens) {
return;
}
cli_strtokenize(k, '|', nsplit, tokens);
do {
while (*p && !isalnum(*p)) {
if (*p == '\\' && (p[1] == '\'' || p[1] == '\"'))
p++;
else
textbuffer_putc(txtbuf, *p++);
}
if (!*p) break;
val = 0;
o = p;
while (*p && isalnum(*p)) {
unsigned x;
unsigned char v = *p++;
/* TODO: use a table here */
if (v >= 'a')
x = 10 + v - 'a';
else if (v >= 'A')
x = 36 + v - 'A';
else
x = v - '0';
val = val * a + x;
}
if (val >= nsplit || !tokens[val] || !tokens[val][0])
while (o != p)
textbuffer_putc(txtbuf, *o++);
else
textbuffer_append(txtbuf, tokens[val]);
} while (*p);
free((void *)tokens);
textbuffer_append(txtbuf, "\0");
}
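/* Worked example (illustrative): with base a == 62 and k == "alert|hello",
 * the payload p == 0("1") decodes as follows: '0' -> val 0 -> tokens[0] ==
 * "alert"; '1' -> val 1 -> tokens[1] == "hello"; non-alphanumeric bytes are
 * copied through, so txtbuf receives alert("hello"). Per character,
 * '0'-'9' map to 0-9, 'a'-'z' to 10-35, and 'A'-'Z' to 36-61, accumulated
 * as val = val * a + x. */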
struct decode_result {
struct text_buffer txtbuf;
size_t pos_begin;
size_t pos_end;
unsigned append : 1; /* 0: tokens are replaced with new token(s),
1: old tokens are deleted, new ones appended at the end */
};
static void handle_de(yystype *tokens, size_t start, const size_t cnt, const char *name, struct decode_result *res)
{
/* find function decl. end */
size_t i, nesting = 1, j;
yystype *parameters[6];
const size_t parameters_cnt = 6;
yystype *first = NULL;
yystype *last = NULL;
for (i = start; i < cnt; i++) {
if (tokens[i].type == TOK_FUNCTION) {
if (TOKEN_GET(&tokens[i], scope))
nesting++;
else
nesting--;
if (!nesting)
break;
}
}
if (nesting)
return;
memset(parameters, 0, sizeof(parameters));
if (name) {
/* find call to function */
for (; i + 2 < cnt; i++) {
const char *token_val = TOKEN_GET(&tokens[i], cstring);
if (tokens[i].type == TOK_IDENTIFIER_NAME &&
token_val &&
!strcmp(name, token_val) &&
tokens[i + 1].type == TOK_PAR_OPEN) {
i += 2;
for (j = 0; j < parameters_cnt && i < cnt; j++) {
parameters[j] = &tokens[i++];
if (j != parameters_cnt - 1)
while (i < cnt && tokens[i].type != TOK_COMMA) i++;
else
while (i < cnt && tokens[i].type != TOK_PAR_CLOSE) i++;
i++;
}
if (j == parameters_cnt) {
if (NULL == first) {
first = parameters[0];
}
last = parameters[parameters_cnt - 1];
decode_de(parameters, &res->txtbuf);
}
}
}
} else {
while (i < cnt && tokens[i].type != TOK_PAR_OPEN) i++;
++i;
if (i >= cnt) return;
/* TODO: move the parameter-collection loop below into a separate function */
for (j = 0; j < parameters_cnt && i < cnt; j++) {
parameters[j] = &tokens[i++];
if (j != parameters_cnt - 1)
while (i < cnt && tokens[i].type != TOK_COMMA) i++;
else
while (i < cnt && tokens[i].type != TOK_PAR_CLOSE) i++;
i++;
}
if (j == parameters_cnt) {
if (NULL == first) {
first = parameters[0];
}
last = parameters[parameters_cnt - 1];
decode_de(parameters, &res->txtbuf);
}
}
if (first && last) {
res->pos_begin = first - tokens;
res->pos_end = last - tokens + 1;
if (res->pos_end + 2 < cnt &&
tokens[res->pos_end].type == TOK_BRACKET_OPEN &&
tokens[res->pos_end + 1].type == TOK_BRACKET_CLOSE &&
tokens[res->pos_end + 2].type == TOK_PAR_CLOSE) {
res->pos_end += 3; /* {}) */
} else if (res->pos_end < cnt) {
/* ) */
res->pos_end++;
}
}
}
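/* Summary of handle_de (descriptive): skip past the packer function
 * declaration (nesting counts TOK_FUNCTION markers: a token carrying a scope
 * pointer opens one, the dummy end-marker closes one), then locate either
 * the named call name(...) or the immediate (...) invocation, collect the
 * six comma-separated arguments into parameters[], decode them with
 * decode_de, and record the token span [pos_begin, pos_end) to be replaced
 * by the decoded text. */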
static cl_error_t handle_unescape(struct tokens *tokens, size_t start)
{
cl_error_t retval;
if (tokens->data[start].type == TOK_StringLiteral) {
char *R;
struct tokens new_tokens;
yystype tok;
R = cli_unescape(TOKEN_GET(&tokens->data[start], cstring));
/* bail out if unescaping failed (e.g. OOM) */
if (!R)
return CL_EMEM;
tok.type = TOK_StringLiteral;
TOKEN_SET(&tok, string, R);
new_tokens.capacity = new_tokens.cnt = 1;
new_tokens.data = &tok;
if (CL_SUCCESS != (retval = replace_token_range(tokens, start - 2, start + 2, &new_tokens))) {
if (retval == CL_EARG) {
size_t i;
cli_dbgmsg(MODULE "replace_token_range failed.\n");
for (i = 0; i < new_tokens.cnt; i++) {
free_token(&(new_tokens.data[i]));
}
}
return CL_EMEM;
}
}
return CL_SUCCESS;
}
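/* e.g. (illustrative) the four tokens
 *   unescape ( "%61%6c%65%72%74" )
 * i.e. indices [start-2, start+2), are replaced by the single string-literal
 * token "alert" produced by cli_unescape(). */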
/* scriptasylum dot com's JS encoder */
static void handle_df(const yystype *tokens, size_t start, struct decode_result *res)
{
char *str, *s1;
size_t len, s1_len, i;
unsigned char clast;
char *R;
if (tokens[start].type != TOK_StringLiteral)
return;
str = TOKEN_GET(&tokens[start], string);
if (!str)
return;
len = strlen(str);
if (!len)
return;
clast = str[len - 1] - '0';
str[len - 1] = '\0';
s1 = cli_unescape(str);
/* cli_unescape allocates; guard against failure before using the result */
if (!s1)
return;
s1_len = strlen(s1);
for (i = 0; i < s1_len; i++) {
s1[i] -= clast;
}
R = cli_unescape(s1);
free(s1);
if (!R)
return;
res->pos_begin = start - 2;
res->pos_end = start + 2;
res->txtbuf.data = R;
res->txtbuf.pos = strlen(R);
res->append = 1;
}
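/* Worked example (illustrative): for a literal whose trailing character is
 * '1' (clast == 1), the preceding bytes "bmfsu" are unescaped (a no-op for
 * plain text), shifted down by 1 to "alert", and unescaped once more, so
 * the decoder re-parses alert(...). */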
static void handle_eval(struct tokens *tokens, size_t start, struct decode_result *res)
{
res->txtbuf.data = TOKEN_GET(&tokens->data[start], string);
if (start + 1 >= tokens->cnt)
return;
if (res->txtbuf.data && tokens->data[start + 1].type == TOK_PAR_CLOSE) {
TOKEN_SET(&tokens->data[start], string, NULL);
res->txtbuf.pos = strlen(res->txtbuf.data);
res->pos_begin = start - 2;
res->pos_end = start + 2;
}
}
static void run_folders(struct tokens *tokens)
{
size_t i;
for (i = 0; i < tokens->cnt; i++) {
const char *cstring = TOKEN_GET(&tokens->data[i], cstring);
if (i + 2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
cstring &&
!strcmp("unescape", cstring) && tokens->data[i + 1].type == TOK_PAR_OPEN) {
handle_unescape(tokens, i + 2);
}
}
}
static inline int state_update_scope(struct parser_state *state, const yystype *token)
{
if (token->type == TOK_FUNCTION) {
struct scope *scope = TOKEN_GET(token, scope);
if (scope) {
state->current = scope;
} else {
/* dummy token marking function end */
if (state->current->parent)
state->current = state->current->parent;
/* don't output this token, it is just a dummy marker */
return 0;
}
}
return 1;
}
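/* On the second pass, the token stream itself drives scope switching: a
 * TOK_FUNCTION carrying a scope pointer enters that scope, while the dummy
 * TOK_FUNCTION emitted at a function's end pops back to the parent and is
 * suppressed from the output (return 0). */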
static void run_decoders(struct parser_state *state)
{
size_t i;
const char *name;
struct tokens *tokens = &state->tokens;
for (i = 0; i < tokens->cnt; i++) {
const char *cstring = TOKEN_GET(&tokens->data[i], cstring);
struct decode_result res;
memset(&(res.txtbuf), 0, sizeof(res.txtbuf));
res.pos_begin = res.pos_end = 0;
res.append = 0;
if (tokens->data[i].type == TOK_FUNCTION && i + 13 < tokens->cnt) {
name = NULL;
++i;
if (tokens->data[i].type == TOK_IDENTIFIER_NAME) {
cstring = TOKEN_GET(&tokens->data[i], cstring);
name = cstring;
++i;
}
if (-1 != match_parameters(&tokens->data[i], tokens->cnt, de_packer_3, sizeof(de_packer_3) / sizeof(de_packer_3[0])) ||
-1 != match_parameters(&tokens->data[i], tokens->cnt, de_packer_2, sizeof(de_packer_2) / sizeof(de_packer_2[0]))) {
/* find function decl. end */
handle_de(tokens->data, i, tokens->cnt, name, &res);
}
} else if (i + 2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
cstring &&
!strcmp("dF", cstring) && tokens->data[i + 1].type == TOK_PAR_OPEN) {
/* TODO: also match the signature of the dF function (possibly
* declared using unescape) */
handle_df(tokens->data, i + 2, &res);
} else if (i + 2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
cstring &&
!strcmp("eval", cstring) && tokens->data[i + 1].type == TOK_PAR_OPEN) {
handle_eval(tokens, i + 2, &res);
}
if (res.pos_end > res.pos_begin) {
struct tokens parent_tokens;
if (res.pos_end < tokens->cnt && tokens->data[res.pos_end].type == TOK_SEMICOLON)
res.pos_end++;
parent_tokens = state->tokens; /* save current tokens */
/* initialize embedded context */
memset(&state->tokens, 0, sizeof(state->tokens));
if (++state->rec > 16)
cli_dbgmsg(MODULE "recursion limit reached\n");
else {
cli_js_process_buffer(state, res.txtbuf.data, res.txtbuf.pos);
--state->rec;
}
CLI_FREE_AND_SET_NULL(res.txtbuf.data);
/* state->tokens still refers to the embedded/nested context here */
if (!res.append) {
if (CL_EARG == replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, &state->tokens)) {
size_t j;
cli_dbgmsg(MODULE "replace_token_range failed.\n");
for (j = 0; j < state->tokens.cnt; j++) {
free_token(&(state->tokens.data[j]));
}
}
} else {
/* delete tokens */
replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, NULL);
append_tokens(&parent_tokens, &state->tokens);
}
/* end of embedded context, restore tokens state */
free(state->tokens.data);
state->tokens = parent_tokens;
}
state_update_scope(state, &state->tokens.data[i]);
}
}
void cli_js_parse_done(struct parser_state *state)
{
struct tokens *tokens = &state->tokens;
size_t par_balance = 0, i;
char end = '\0';
YYSTYPE val;
cli_dbgmsg(MODULE "in cli_js_parse_done()\n");
/* close unfinished token */
switch (state->scanner->state) {
case DoubleQString:
end = '"';
break;
case SingleQString:
end = '\'';
break;
default: /* make gcc happy */
break;
}
if (end != '\0')
cli_js_process_buffer(state, &end, 1);
/* close remaining parentheses */
for (i = 0; i < tokens->cnt; i++) {
if (tokens->data[i].type == TOK_PAR_OPEN)
par_balance++;
else if (tokens->data[i].type == TOK_PAR_CLOSE && par_balance > 0)
par_balance--;
}
if (par_balance > 0) {
memset(&val, 0, sizeof(val));
val.type = TOK_PAR_CLOSE;
TOKEN_SET(&val, cstring, ")");
while (par_balance-- > 0) {
add_token(state, &val);
}
}
/* we had to close unfinished strings and parentheses
 * so that the folders/decoders can run properly */
run_folders(&state->tokens);
run_decoders(state);
yylex_destroy(state->scanner);
state->scanner = NULL;
}
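/*
 * Write the normalized token stream, wrapped in <script>...</script>, to
 * the "javascript" file in tempdir. The file is opened without O_TRUNC and
 * appended to, so all scripts from one scanned file accumulate in a single
 * output file, separated by newlines.
 */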
void cli_js_output(struct parser_state *state, const char *tempdir)
{
unsigned i;
struct buf buf;
char lastchar = '\0';
char filename[1024];
snprintf(filename, sizeof(filename), "%s" PATHSEP "javascript", tempdir);
buf.pos = 0;
buf.outfd = open(filename, O_CREAT | O_WRONLY | O_BINARY, 0600);
if (buf.outfd < 0) {
cli_errmsg(MODULE "cannot open output file for writing: %s\n", filename);
return;
}
/* append to the file */
if (lseek(buf.outfd, 0, SEEK_END) != 0) {
/* the file already has content: separate multiple scripts with \n */
buf_outc('\n', &buf);
}
buf_outs("<script>", &buf);
state->current = state->global;
for (i = 0; i < state->tokens.cnt; i++) {
if (state_update_scope(state, &state->tokens.data[i]))
lastchar = output_token(&state->tokens.data[i], state->current, &buf, lastchar);
}
/* add </script> if not already there */
if (buf.pos < 9 || memcmp(buf.buf + buf.pos - 9, "</script>", 9))
buf_outs("</script>", &buf);
if (write(buf.outfd, buf.buf, buf.pos) < 0) {
cli_dbgmsg(MODULE "I/O error\n");
}
close(buf.outfd);
cli_dbgmsg(MODULE "dumped/appended normalized script to: %s\n", filename);
}
void cli_js_destroy(struct parser_state *state)
{
size_t i;
if (!state)
return;
scope_free_all(state->list);
for (i = 0; i < state->tokens.cnt; i++) {
free_token(&state->tokens.data[i]);
}
free(state->tokens.data);
if (state->scanner)
yylex_destroy(state->scanner);
/* poison the freed structure to help detect use-after-free */
memset(state, 0x55, sizeof(*state));
free(state);
cli_dbgmsg(MODULE "cli_js_destroy() done\n");
}
/* The buffer is an html-norm-like "chunk": if the original file is bigger
 * than the buffer, we rewind to a space, so tokens (except string literals,
 * of course) won't be broken in half at the end of a buffer.
 * So we can assume that after the buffer there is either a space, EOF, or a
 * chunk of text containing no whitespace at all (which we care about only if
 * it is a string literal). */
void cli_js_process_buffer(struct parser_state *state, const char *buf, size_t n)
{
struct scope *current = state->current;
YYSTYPE val = {0};
int yv;
if (!state->global) {
/* this state has either not been initialized,
* or cli_js_parse_done() was already called on it */
cli_warnmsg(MODULE "invalid state\n");
return;
}
yy_scan_bytes(buf, n, state->scanner);
val.vtype = vtype_undefined;
/* on EOF yylex will return 0 */
while ((yv = yylex(&val, state->scanner)) != 0) {
const char *text;
size_t leng;
val.type = yv;
switch (yv) {
case TOK_VAR:
current->fsm_state = InsideVar;
break;
case TOK_IDENTIFIER_NAME:
text = yyget_text(state->scanner);
leng = yyget_leng(state->scanner);
if (current->last_token == TOK_DOT) {
/* this is a member name, don't normalize */
TOKEN_SET(&val, string, cli_safer_strdup(text));
val.type = TOK_UNNORM_IDENTIFIER;
} else {
switch (current->fsm_state) {
case WaitParameterList:
state->syntax_errors++;
/* fall through */
case Base:
case InsideInitializer:
TOKEN_SET(&val, cstring, scope_use(current, text, leng));
break;
case InsideVar:
case InsideFunctionDecl:
TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
current->fsm_state = InsideInitializer;
current->brackets = 0;
break;
case WaitFunctionName:
TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
current->fsm_state = WaitParameterList;
break;
}
}
break;
case TOK_PAR_OPEN:
switch (current->fsm_state) {
case WaitFunctionName:
/* fallthrough */
case WaitParameterList:
current->fsm_state = InsideFunctionDecl;
break;
default:
/* noop */
break;
}
break;
case TOK_PAR_CLOSE:
switch (current->fsm_state) {
case WaitFunctionName:
state->syntax_errors++;
break;
case WaitParameterList:
current->fsm_state = Base;
break;
default:
/* noop */
break;
}
break;
case TOK_CURLY_BRACE_OPEN:
switch (current->fsm_state) {
case WaitFunctionName:
/* fallthrough */
case WaitParameterList:
case InsideFunctionDecl:
/* in a syntactically correct
* file, we would already be in
* the Base state when we see a {
*/
current->fsm_state = Base;
/* fall-through */
case InsideVar:
case InsideInitializer:
state->syntax_errors++;
/* fall-through */
case Base:
default:
current->blocks++;
break;
}
break;
case TOK_CURLY_BRACE_CLOSE:
if (current->blocks > 0)
current->blocks--;
else
state->syntax_errors++;
if (!current->blocks) {
if (current->parent) {
/* add dummy FUNCTION token to
* mark function end */
TOKEN_SET(&val, cstring, "}");
add_token(state, &val);
TOKEN_SET(&val, scope, NULL);
val.type = TOK_FUNCTION;
state->current = current = current->parent;
} else {
/* extra } */
state->syntax_errors++;
}
}
break;
case TOK_BRACKET_OPEN:
current->brackets++;
break;
case TOK_BRACKET_CLOSE:
if (current->brackets > 0)
current->brackets--;
else
state->syntax_errors++;
break;
case TOK_COMMA:
if (current->fsm_state == InsideInitializer && current->brackets == 0 && current->blocks == 0) {
/* the initializer ends only at a comma with balanced [] and {},
 * which avoids switching state inside:
 * var x = [4,y,u]; */
current->fsm_state = InsideVar;
}
break;
case TOK_SEMICOLON:
if (current->brackets == 0 && current->blocks == 0) {
/* avoid switching state on an unbalanced [] or {}:
 * var x = [test;testi]; */
current->fsm_state = Base;
}
break;
case TOK_FUNCTION:
current = scope_new(state);
current->fsm_state = WaitFunctionName;
TOKEN_SET(&val, scope, state->current);
break;
case TOK_StringLiteral:
if (state->tokens.cnt > 1 && state->tokens.data[state->tokens.cnt - 1].type == TOK_PLUS) {
/* see if we can constant-fold adjacent string literals: "a" + "b" -> "ab" */
yystype *prev_string = &state->tokens.data[state->tokens.cnt - 2];
if (prev_string->type == TOK_StringLiteral) {
char *str = TOKEN_GET(prev_string, string);
size_t str_len = strlen(str);
text = yyget_text(state->scanner);
leng = yyget_leng(state->scanner);
/* delete TOK_PLUS */
free_token(&state->tokens.data[--state->tokens.cnt]);
str = cli_max_realloc(str, str_len + leng + 1);
if (!str)
break;
strncpy(str + str_len, text, leng);
str[str_len + leng] = '\0';
TOKEN_SET(prev_string, string, str);
free(val.val.string);
memset(&val, 0, sizeof(val));
val.vtype = vtype_undefined;
continue;
}
}
break;
}
if (val.vtype == vtype_undefined) {
/* every token type handled above must have set a value by now;
 * reaching this point is a programming error */
text = yyget_text(state->scanner);
TOKEN_SET(&val, string, cli_safer_strdup(text));
abort();
}
add_token(state, &val);
current->last_token = yv;
memset(&val, 0, sizeof(val));
val.vtype = vtype_undefined;
}
}
struct parser_state *cli_js_init(void)
{
struct parser_state *state = calloc(1, sizeof(*state));
if (!state)
return NULL;
if (!scope_new(state)) {
free(state);
return NULL;
}
state->global = state->current;
if (yylex_init(&state->scanner)) {
scope_done(state->global);
free(state);
return NULL;
}
cli_dbgmsg(MODULE "cli_js_init() done\n");
return state;
}
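/*
 * Typical driver sequence (a sketch only; the real callers live elsewhere
 * in libclamav and may interleave these calls differently):
 *
 *   struct parser_state *st = cli_js_init();
 *   if (st) {
 *       cli_js_process_buffer(st, chunk, chunk_len); // once per chunk
 *       cli_js_parse_done(st);
 *       cli_js_output(st, tempdir);
 *       cli_js_destroy(st);
 *   }
 */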
/*-------------- tokenizer ---------------------*/
enum char_class {
Whitespace,
Slash,
Operator,
DQuote,
SQuote,
Digit,
IdStart,
BracketOpen = TOK_BRACKET_OPEN,
BracketClose = TOK_BRACKET_CLOSE,
Comma = TOK_COMMA,
CurlyOpen = TOK_CURLY_BRACE_OPEN,
CurlyClose = TOK_CURLY_BRACE_CLOSE,
ParOpen = TOK_PAR_OPEN,
ParClose = TOK_PAR_CLOSE,
Dot = TOK_DOT,
SemiColon = TOK_SEMICOLON,
Nop
};
#define SL Slash
#define DG Digit
#define DQ DQuote
#define SQ SQuote
#define ID IdStart
#define OP Operator
#define WS Whitespace
#define BO BracketOpen
#define BC BracketClose
#define CM Comma
#define CO CurlyOpen
#define CC CurlyClose
#define PO ParOpen
#define PC ParClose
#define DT Dot
#define SC SemiColon
#define NA Nop
static const enum char_class ctype[256] = {
NA, NA, NA, NA, NA, NA, NA, NA, NA, WS, WS, WS, NA, WS, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
WS, OP, DQ, NA, ID, OP, OP, SQ, PO, PC, OP, OP, CM, OP, DT, SL,
DG, DG, DG, DG, DG, DG, DG, DG, DG, DG, OP, SC, OP, OP, OP, OP,
NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, BO, ID, BC, OP, ID,
NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, CO, OP, CC, OP, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA};
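/* id_ctype classifies the bytes that may appear inside an identifier: '$',
 * '0'-'9', 'A'-'Z', '_' and 'a'-'z' are IdStart; '\\' maps to Operator so
 * that parseId() can recognize "\u...." escapes inside identifiers; any
 * other byte terminates the identifier. */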
static const enum char_class id_ctype[256] = {
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, ID, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, NA,
NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, OP, NA, NA, ID,
NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA};
#define CASE_SPECIAL_CHAR(C, S) \
case C: \
TOKEN_SET(lvalp, cstring, (S)); \
return cClass;
#define BUF_KEEP_SIZE 32768
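/* Reset the token scratch buffer, shrinking it back to BUF_KEEP_SIZE if a
 * previous large token grew it beyond that, to bound memory use. */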
static void textbuf_clean(struct text_buffer *buf)
{
if (buf->capacity > BUF_KEEP_SIZE) {
char *data = cli_max_realloc(buf->data, BUF_KEEP_SIZE);
if (data)
buf->data = data;
buf->capacity = BUF_KEEP_SIZE;
}
buf->pos = 0;
}
static inline int parseString(YYSTYPE *lvalp, yyscan_t scanner, const char q,
enum tokenizer_state tostate)
{
size_t len;
/* look for the quote character (q) that terminates the string */
const char *start = &scanner->in[scanner->pos], *end = start;
do {
const size_t siz = &scanner->in[scanner->insize] - end;
end = memchr(end, q, siz);
if (end && end > start && end[-1] == '\\') {
++end;
continue;
}
break;
} while (1);
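/* Note: the escape test above treats any quote preceded by '\\' as escaped,
 * so a literal ending in an escaped backslash (e.g. "a\\") scans past its
 * real end; an acceptable approximation for normalization purposes. */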
if (end && end >= start)
len = end - start;
else
len = scanner->insize - scanner->pos;
cli_textbuffer_append_normalize(&scanner->buf, start, len);
if (end) {
char *str;
/* skip over end quote */
scanner->pos += len + 1;
textbuffer_putc(&scanner->buf, '\0');
str = textbuffer_done(scanner);
if (str) {
TOKEN_SET(lvalp, string, str);
} else {
TOKEN_SET(lvalp, cstring, "");
}
scanner->state = Initial;
assert(lvalp->val.string);
return TOK_StringLiteral;
} else {
scanner->pos += len;
/* unfinished string */
scanner->state = tostate;
return 0;
}
}
static inline int parseDQString(YYSTYPE *lvalp, yyscan_t scanner)
{
return parseString(lvalp, scanner, '"', DoubleQString);
}
static inline int parseSQString(YYSTYPE *lvalp, yyscan_t scanner)
{
return parseString(lvalp, scanner, '\'', SingleQString);
}
static inline int parseNumber(YYSTYPE *lvalp, yyscan_t scanner)
{
const unsigned char *in = (const unsigned char *)scanner->in;
int is_float = 0;
while (scanner->pos < scanner->insize) {
unsigned char c = in[scanner->pos++];
if (isdigit(c)) {
textbuffer_putc(&scanner->buf, c);
continue;
}
if (c == '.' && !is_float) {
is_float = 1;
textbuffer_putc(&scanner->buf, '.');
continue;
}
if ((c == 'e' || c == 'E') && is_float) {
textbuffer_putc(&scanner->buf, c);
if (scanner->pos < scanner->insize) {
c = in[scanner->pos++];
if (c == '+' || c == '-' || isdigit(c)) {
textbuffer_putc(&scanner->buf, c);
continue;
}
}
}
scanner->pos--;
textbuffer_putc(&scanner->buf, '\0');
scanner->state = Initial;
if (!scanner->buf.data)
return 0;
if (is_float) {
TOKEN_SET(lvalp, dval, atof(scanner->buf.data));
return TOK_NumericFloat;
} else {
TOKEN_SET(lvalp, ival, atoi(scanner->buf.data));
return TOK_NumericInt;
}
}
scanner->state = Number;
return 0;
}
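/* Note on parseNumber(): an exponent is accepted only after a decimal
 * point, so "1e5" tokenizes as the integer 1 followed by the identifier
 * "e5". Numeric fidelity is not required for normalization, so this
 * approximation appears intentional. */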
static inline int parseId(YYSTYPE *lvalp, yyscan_t scanner)
{
const struct keyword *kw;
const unsigned char *in = (const unsigned char *)scanner->in;
scanner->state = Initial;
while (scanner->pos < scanner->insize) {
unsigned char c = in[scanner->pos++];
enum char_class cClass = id_ctype[c];
switch (cClass) {
case IdStart:
textbuffer_putc(&scanner->buf, c);
break;
case Operator:
/* the table contains OP only for \ */
assert(c == '\\');
if (scanner->pos < scanner->insize &&
in[scanner->pos++] == 'u') {
textbuffer_putc(&scanner->buf, c);
break;
}
if (scanner->pos == scanner->insize) {
/* trailing '\' at the end of the input: bump pos so the
 * pos-- in the default case below cancels out */
scanner->pos++;
}
/* else fallthrough */
default:
/* character is no longer part of identifier */
scanner->state = Initial;
textbuffer_putc(&scanner->buf, '\0');
scanner->pos--;
kw = in_word_set(scanner->buf.data, scanner->buf.pos - 1);
if (kw) {
/* we got a keyword */
TOKEN_SET(lvalp, cstring, kw->name);
return kw->val;
}
/* it is not a keyword, just an identifier */
TOKEN_SET(lvalp, cstring, NULL);
return TOK_IDENTIFIER_NAME;
}
}
scanner->state = Identifier;
return 0;
}
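/* Note on parseId(): in_word_set() is the perfect-hash keyword lookup
 * (presumably gperf-generated, given the in_word_set naming convention);
 * keywords keep their fixed spelling, while everything else becomes
 * TOK_IDENTIFIER_NAME and is later renamed via scope_use()/scope_declare(). */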
static int parseOperator(YYSTYPE *lvalp, yyscan_t scanner)
{
size_t len = MIN(5, scanner->insize - scanner->pos);
while (len) {
const struct operator* kw = in_op_set(&scanner->in[scanner->pos], len);
if (kw) {
TOKEN_SET(lvalp, cstring, kw->name);
scanner->pos += len;
return kw->val;
}
len--;
}
/* never reached */
assert(0);
scanner->pos++;
TOKEN_SET(lvalp, cstring, NULL);
return TOK_ERROR;
}
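/* parseOperator() is maximal-munch: it tries the longest candidate first
 * (operators here are at most 5 bytes), so e.g. ">>>=" becomes one token
 * rather than four. Since every single operator character is in the set,
 * the assert(0) fallback should be unreachable. */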
static int yylex_init(yyscan_t *scanner)
{
*scanner = calloc(1, sizeof(**scanner));
return *scanner ? 0 : -1;
}
static int yylex_destroy(yyscan_t scanner)
{
free(scanner->buf.data);
free(scanner);
return 0;
}
static void yy_scan_bytes(const char *p, size_t len, yyscan_t scanner)
{
scanner->in = p;
scanner->insize = len;
scanner->pos = 0;
scanner->lastpos = -1;
scanner->last_state = Dummy;
return;
}
static const char *yyget_text(yyscan_t scanner)
{
return scanner->yytext ? scanner->yytext : scanner->buf.data;
}
static int yyget_leng(yyscan_t scanner)
{
/* buf.pos includes the trailing '\0', hence the -1 */
return scanner->yylen ? scanner->yylen : (scanner->buf.pos > 0 ? scanner->buf.pos - 1 : 0);
}
static int yylex(YYSTYPE *lvalp, yyscan_t scanner)
{
const size_t len = scanner->insize;
const unsigned char *in = (const unsigned char *)scanner->in;
unsigned char lookahead;
enum char_class cClass;
scanner->yytext = NULL;
scanner->yylen = 0;
if (scanner->pos == scanner->lastpos) {
if (scanner->last_state == scanner->state) {
cli_dbgmsg(MODULE "infloop detected, skipping character\n");
scanner->pos++;
}
/* not necessarily an infloop if the state changed;
 * the tokenizer shouldn't loop forever between states */
}
scanner->lastpos = scanner->pos;
scanner->last_state = scanner->state;
while (scanner->pos < scanner->insize) {
switch (scanner->state) {
case Initial:
textbuf_clean(&scanner->buf);
cClass = ctype[in[scanner->pos++]];
switch (cClass) {
case Whitespace:
/* eat whitespace */
continue;
case Slash:
if (scanner->pos < len) {
lookahead = in[scanner->pos];
switch (lookahead) {
case '*':
scanner->state = MultilineComment;
scanner->pos++;
continue;
case '/':
scanner->state = SinglelineComment;
scanner->pos++;
continue;
}
}
--scanner->pos;
return parseOperator(lvalp, scanner);
case Operator:
--scanner->pos;
return parseOperator(lvalp, scanner);
case DQuote:
return parseDQString(lvalp, scanner);
case SQuote:
return parseSQString(lvalp, scanner);
case Digit:
--scanner->pos;
return parseNumber(lvalp, scanner);
case IdStart:
--scanner->pos;
return parseId(lvalp, scanner);
CASE_SPECIAL_CHAR(BracketOpen, "[");
CASE_SPECIAL_CHAR(BracketClose, "]");
CASE_SPECIAL_CHAR(Comma, ",");
CASE_SPECIAL_CHAR(CurlyOpen, "{");
CASE_SPECIAL_CHAR(CurlyClose, "}");
CASE_SPECIAL_CHAR(ParOpen, "(");
CASE_SPECIAL_CHAR(ParClose, ")");
CASE_SPECIAL_CHAR(Dot, ".");
CASE_SPECIAL_CHAR(SemiColon, ";");
case Nop:
continue;
}
break;
case DoubleQString:
return parseString(lvalp, scanner, '"', DoubleQString);
case SingleQString:
return parseString(lvalp, scanner, '\'', SingleQString);
case Identifier:
return parseId(lvalp, scanner);
case MultilineComment:
while (scanner->pos + 1 < scanner->insize) {
if (in[scanner->pos] == '*' && in[scanner->pos + 1] == '/') {
scanner->state = Initial;
scanner->pos++;
break;
}
scanner->pos++;
}
scanner->pos++;
break;
case Number:
return parseNumber(lvalp, scanner);
case SinglelineComment:
while (scanner->pos < scanner->insize) {
/* htmlnorm converts \n to space, so
* stop on space too */
if (in[scanner->pos] == '\n' || in[scanner->pos] == ' ')
break;
scanner->pos++;
}
scanner->state = Initial;
break;
default:
assert(0 && "Not reached");
}
}
return 0;
}