libclamav: Add URI scanning support to PDF parser

Threat Research requests scanning URIs in PDF files and adding them to
the json report file.

This change adds URI scanning support to the PDF parser, including
support for object references to URIs in PDF files.

Jira: CLAM-2588

Fix out-of-order references and other minor improvements.

CLAM-2588, CLAM-2757
This commit is contained in:
John Humlick 2025-04-07 16:50:09 -07:00
parent 492e505070
commit e1e3d4c64d
No known key found for this signature in database
GPG key ID: 543839C3681B11C1
14 changed files with 485 additions and 111 deletions

View file

@ -254,8 +254,10 @@ void help(void)
mprintf(LOGG_INFO, " --gen-json[=yes/no(*)] Generate JSON metadata for the scanned file(s). For testing & development use ONLY.\n");
mprintf(LOGG_INFO, " JSON will be printed if --debug is enabled.\n");
mprintf(LOGG_INFO, " A JSON file will dropped to the temp directory if --leave-temps is enabled.\n");
mprintf(LOGG_INFO, " --json-store-html-urls[=yes(*)/no] Store html URLs in metadata.\n");
mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n");
mprintf(LOGG_INFO, " --json-store-html-uris[=yes(*)/no] Store html URIs in metadata.\n");
mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'URIs'\n");
mprintf(LOGG_INFO, " --json-store-pdf-uris[=yes(*)/no] Store pdf URIs in metadata.\n");
mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'URIs'\n");
mprintf(LOGG_INFO, " --database=FILE/DIR -d FILE/DIR Load virus database from FILE or load all supported db files from DIR\n");
mprintf(LOGG_INFO, " --official-db-only[=yes/no(*)] Only load official signatures\n");
mprintf(LOGG_INFO, " --fail-if-cvd-older-than=days Return with a nonzero error code if virus database outdated.\n");

View file

@ -1574,8 +1574,12 @@ int scanmanager(const struct optstruct *opts)
options.general |= CL_SCAN_GENERAL_HEURISTICS;
}
if (optget(opts, "json-store-html-urls")->enabled) {
options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS;
if (optget(opts, "json-store-html-uris")->enabled) {
options.general |= CL_SCAN_GENERAL_STORE_HTML_URIS;
}
if (optget(opts, "json-store-pdf-uris")->enabled) {
options.general |= CL_SCAN_GENERAL_STORE_PDF_URIS;
}
/* TODO: Remove deprecated option in a future feature release */

View file

@ -389,7 +389,8 @@ const struct clam_option __clam_options[] = {
{"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"},
{"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"},
{"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},
{"JsonStoreHTMLURIs", "json-store-html-uris", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},
{"JsonStorePDFURIs", "json-store-pdf-uris", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in PDF /URI tags.", "yes"},
{"HeuristicScanPrecedence", "heuristic-scan-precedence", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Allow heuristic match to take precedence.\nWhen enabled, if a heuristic scan (such as phishingScan) detects\na possible virus/phish it will stop scan immediately. Recommended, saves CPU\nscan-time.\nWhen disabled, virus/phish detected by heuristic scans will be reported only\nat the end of a scan. If an archive contains both a heuristically detected\nvirus/phish, and a real malware, the real malware will be reported.\nKeep this disabled if you intend to handle \"Heuristics.*\" viruses\ndifferently from \"real\" malware.\nIf a non-heuristically-detected virus (signature-based) is found first,\nthe scan is interrupted immediately, regardless of this config option.", "yes"},

View file

@ -254,11 +254,17 @@ Example
# Default: no
#GenerateMetadataJson yes
# Store URLs found in html files to the json metadata.
# URLs will be stored in an array with the tag 'HTMLUrls'
# Store URIs found in html files to the json metadata.
# URIs will be stored in an array with the tag 'URIs'
# GenerateMetadataJson is required for this feature.
# Default: yes (if GenerateMetadataJson is used)
#JsonStoreHTMLUrls no
#JsonStoreHTMLURIs no
# Store URIs found in pdf files to the json metadata.
# URIs will be stored in an array with the tag 'URIs'
# GenerateMetadataJson is required for this feature.
# Default: yes (if GenerateMetadataJson is used)
#JsonStorePDFURIs no
# Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
# any ALLMATCHSCAN command as invalid.

View file

@ -263,7 +263,8 @@ enum pdf_objflags {
OBJ_FILTER_STANDARD, /* */
OBJ_LAUNCHACTION, /* */
OBJ_PAGE, /* */
OBJ_CONTENTS /* */
OBJ_CONTENTS, /* */
OBJ_URI /* */
};
/**

View file

@ -61,6 +61,11 @@
#endif
/* Apple does not define __pid_t */
#ifdef __APPLE__
typedef pid_t __pid_t;
#endif
#define UNUSEDPARAM(x) (void)(x)
#include <sys/types.h>
@ -168,7 +173,8 @@ struct cl_scan_options {
#define CL_SCAN_GENERAL_HEURISTICS 0x4 /* option to enable heuristic alerts */
#define CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE 0x8 /* allow heuristic match to take precedence. */
#define CL_SCAN_GENERAL_UNPRIVILEGED 0x10 /* scanner will not have read access to files. */
#define CL_SCAN_GENERAL_STORE_HTML_URLS 0x20 /* Store urls found in html <a and <form tags when recording JSON metadata */
#define CL_SCAN_GENERAL_STORE_HTML_URIS 0x20 /* Store uris found in html <a and <form tags when recording JSON metadata */
#define CL_SCAN_GENERAL_STORE_PDF_URIS 0x40 /* Store uris found in pdf /URI tags when recording JSON metadata */
/* parsing capabilities options */
#define CL_SCAN_PARSE_ARCHIVE 0x1

View file

@ -552,7 +552,8 @@ extern LIBCLAMAV_EXPORT int have_rar;
#define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS)
#define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE)
#define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED)
#define SCAN_STORE_HTML_URLS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URLS)
#define SCAN_STORE_HTML_URIS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URIS)
#define SCAN_STORE_PDF_URIS (ctx->options->general & CL_SCAN_GENERAL_STORE_PDF_URIS)
#define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE)
#define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF)

View file

@ -116,6 +116,7 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
/* End PDF statistics callbacks and related */
@ -1446,22 +1447,28 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf, struct pdf_obj *obj
cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
{
cl_error_t status = CL_SUCCESS;
cl_error_t ret;
char fullname[PATH_MAX + 1];
int fout = -1;
size_t sum = 0;
cl_error_t rc = CL_SUCCESS;
int dump = 1;
bool extracted_an_object = false;
int fout = -1;
size_t sum = 0;
bool dump = true;
struct pdf_dict *dparams = NULL;
cli_dbgmsg("pdf_extract_obj: obj %u %u\n", obj->id >> 8, obj->id & 0xff);
if (PDF_OBJECT_RECURSION_LIMIT < pdf->parse_recursion_depth) {
cli_dbgmsg("pdf_extract_obj: Recursion limit reached.\n");
return CL_SUCCESS;
status = CL_SUCCESS;
goto done;
}
if (obj->extracted) {
// Should not attempt to extract the same object more than once.
return CL_SUCCESS;
status = CL_SUCCESS;
goto done;
}
// We're not done yet, but this is enough to say we've tried.
// Trying again won't help any.
@ -1471,28 +1478,38 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
cli_dbgmsg("pdf_extract_obj: extracting obj found in objstm.\n");
if (obj->objstm->streambuf == NULL) {
cli_warnmsg("pdf_extract_obj: object in object stream has null stream buffer!\n");
return CL_EFORMAT;
status = CL_EFORMAT;
goto done;
}
}
/* Check to see if this is a URI referenced from a prior URI object */
if (obj->flags & (1 << OBJ_URI)) {
URI_cb(pdf, obj, NULL);
status = CL_SUCCESS;
goto done;
}
/* TODO: call bytecode hook here, allow override dumpability */
if ((!(obj->flags & (1 << OBJ_STREAM)) || (obj->flags & (1 << OBJ_HASFILTERS))) && !(obj->flags & DUMP_MASK)) {
/* don't dump all streams */
dump = 0;
dump = false;
}
if ((obj->flags & (1 << OBJ_IMAGE)) && !(obj->flags & (1 << OBJ_FILTER_DCT))) {
/* don't dump / scan non-JPG images */
dump = 0;
dump = false;
}
if (obj->flags & (1 << OBJ_FORCEDUMP)) {
/* bytecode can force dump by setting this flag */
dump = 1;
dump = true;
}
if (!dump)
return CL_CLEAN;
if (!dump) {
status = CL_SUCCESS;
goto done;
}
cli_dbgmsg("pdf_extract_obj: dumping obj %u %u\n", obj->id >> 8, obj->id & 0xff);
@ -1501,11 +1518,17 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
if (fout < 0) {
char err[128];
cli_errmsg("pdf_extract_obj: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
return CL_ETMPFILE;
status = CL_ETMPFILE;
goto done;
}
extracted_an_object = true;
if (!(flags & PDF_EXTRACT_OBJ_SCAN)) {
/*
* When PDF_EXTRACT_OBJ_SCAN is not set, this function is used to extract the object to a temp file
* and so we need to save off the path in obj->path for the caller to use.
*/
if (NULL != obj->path) {
obj->path = strdup(fullname);
}
@ -1525,7 +1548,6 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */
const char *pstr;
struct pdf_dict *dparams = NULL;
struct objstm_struct *objstm = NULL;
int xref = 0;
@ -1582,7 +1604,10 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
length = obj->stream_size;
if (0 == length) {
cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n");
goto done; /* Empty stream, nothing to scan */
/* Empty stream, nothing to scan */
status = CL_SUCCESS;
goto done;
}
}
@ -1647,15 +1672,15 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
if (!pdf->objstms) {
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
pdf_free_dict(dparams);
return CL_EMEM;
status = CL_EMEM;
goto done;
}
objstm = malloc(sizeof(struct objstm_struct));
if (!objstm) {
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
pdf_free_dict(dparams);
return CL_EMEM;
status = CL_EMEM;
goto done;
}
pdf->objstms[pdf->nobjstms - 1] = objstm;
@ -1673,18 +1698,18 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
}
}
sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm);
if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) {
cli_dbgmsg("Error decoding stream! Error code: %d\n", rc);
sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &status, objstm);
if ((CL_SUCCESS != status) && (CL_VIRUS != status)) {
cli_dbgmsg("Error decoding stream! Error code: %d\n", status);
/* It's ok if we couldn't decode the stream,
* make a best effort to keep parsing...
* Unless we were unable to allocate memory.*/
if (CL_EMEM == rc) {
goto really_done;
if (CL_EMEM == status) {
goto done;
}
if (CL_EPARSE == rc) {
rc = CL_SUCCESS;
if (CL_EPARSE == status) {
status = CL_SUCCESS;
}
if (NULL != objstm) {
@ -1713,7 +1738,8 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
if (!pdf->objstms) {
cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n");
return CL_EMEM;
status = CL_EMEM;
goto done;
}
}
} else {
@ -1724,11 +1750,13 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
}
}
if (dparams)
if (dparams) {
pdf_free_dict(dparams);
dparams = NULL;
}
if (rc == CL_VIRUS) {
sum = 0; /* prevents post-filter scan */
if (status == CL_VIRUS) {
/* skip post-filter scan */
goto done;
}
@ -1741,7 +1769,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
off_t bytesleft = obj->size;
if (bytesleft < 0) {
goto done;
goto scan_extracted_objects;
}
do {
@ -1789,7 +1817,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
pdf->stats.njs++;
if (filter_writen(pdf, obj, fout, out, js_len, (size_t *)&sum) != js_len) {
rc = CL_EWRITE;
status = CL_EWRITE;
free(js);
break;
}
@ -1824,64 +1852,81 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
off_t bytesleft = obj->size;
if (bytesleft < 0)
rc = CL_EFORMAT;
status = CL_EFORMAT;
else {
if (obj->objstm) {
if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft)
rc = CL_EWRITE;
if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) {
status = CL_EWRITE;
}
} else {
if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft)
rc = CL_EWRITE;
if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) {
status = CL_EWRITE;
}
}
}
}
scan_extracted_objects:
cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff);
cli_dbgmsg("pdf_extract_obj: ... to %s\n", fullname);
if ((flags & PDF_EXTRACT_OBJ_SCAN) && (sum > 0)) {
/*
* Scan the extracted objects for potential threats.
* PDF_EXTRACT_OBJ_SCAN is used when the extracted object should be scanned and then deleted.
*/
/* TODO: invoke bytecode on this pdf obj with metainformation associated */
lseek(fout, 0, SEEK_SET);
ret = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE);
if (ret != CL_SUCCESS) {
status = ret;
goto done;
}
if ((status == CL_CLEAN) || (status == CL_VIRUS)) {
ret = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout);
if (ret == CL_VIRUS) {
status = ret;
goto done;
}
}
if (((status == CL_CLEAN) || (status == CL_VIRUS)) && (obj->flags & (1 << OBJ_CONTENTS))) {
lseek(fout, 0, SEEK_SET);
cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff);
ret = pdf_scan_contents(fout, pdf, obj);
if (ret != CL_SUCCESS) {
status = ret;
goto done;
}
}
}
done:
cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff);
cli_dbgmsg("pdf_extract_obj: ... to %s\n", fullname);
if (NULL != dparams) {
pdf_free_dict(dparams);
}
if (flags & PDF_EXTRACT_OBJ_SCAN && sum) {
int rc2;
if (-1 != fout) {
close(fout);
}
/* TODO: invoke bytecode on this pdf obj with metainformation associated */
lseek(fout, 0, SEEK_SET);
rc2 = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE);
if (rc2 != CL_SUCCESS) {
rc = rc2;
goto really_done;
}
if ((rc == CL_CLEAN) || (rc == CL_VIRUS)) {
rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout);
if (rc2 == CL_VIRUS) {
rc = rc2;
goto really_done;
}
}
if (((rc == CL_CLEAN) || (rc == CL_VIRUS)) && (obj->flags & (1 << OBJ_CONTENTS))) {
lseek(fout, 0, SEEK_SET);
cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff);
rc2 = pdf_scan_contents(fout, pdf, obj);
if (rc2 != CL_SUCCESS) {
rc = rc2;
goto really_done;
}
if (extracted_an_object && (flags & PDF_EXTRACT_OBJ_SCAN) && !pdf->ctx->engine->keeptmp) {
/*
* When PDF_EXTRACT_OBJ_SCAN is set, the goal is to extract, scan, and delete it.
* If it was not set, we would keep it and the path is passed back obj->path for the caller to use.
* That's why we wouldn't unlink it here.
*/
if (cli_unlink(fullname) && status != CL_VIRUS) {
status = CL_EUNLINK;
}
}
really_done:
close(fout);
if (CL_EMEM != rc) {
if (flags & PDF_EXTRACT_OBJ_SCAN && !pdf->ctx->engine->keeptmp)
if (cli_unlink(fullname) && rc != CL_VIRUS)
rc = CL_EUNLINK;
}
return rc;
return status;
}
enum objstate {
@ -1893,6 +1938,7 @@ enum objstate {
STATE_LINEARIZED,
STATE_LAUNCHACTION,
STATE_CONTENTS,
STATE_URI,
STATE_ANY /* for actions table below */
};
@ -1954,7 +2000,8 @@ static struct pdfname_action pdfname_actions[] = {
{"Colors", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Colors_cb},
{"RichMedia", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, RichMedia_cb},
{"AcroForm", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AcroForm_cb},
{"XFA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, XFA_cb}};
{"XFA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, XFA_cb},
{"URI", OBJ_DICT, STATE_NONE, STATE_URI, NAMEFLAG_NONE, URI_cb}};
#define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT))
@ -1963,12 +2010,24 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const ch
struct pdfname_action *act = NULL;
unsigned j;
// If we process STATE_S we will get duplicate URIs from the prior STATE_NONE
if (!strcmp(pdfname, "URI") && *state == STATE_S) {
*state = STATE_NONE;
return;
}
obj->statsflags |= OBJ_FLAG_PDFNAME_DONE;
for (j = 0; j < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); j++) {
if (!strcmp(pdfname, pdfname_actions[j].pdfname)) {
act = &pdfname_actions[j];
break;
// Check to see if this object was observed to be a reference to a URI
if (obj->flags & (1 << OBJ_URI)) {
act = &(struct pdfname_action){"URI", OBJ_DICT, STATE_ANY, STATE_URI, NAMEFLAG_NONE, URI_cb};
}
if (!act) {
for (j = 0; j < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); j++) {
if (!strcmp(pdfname, pdfname_actions[j].pdfname)) {
act = &pdfname_actions[j];
break;
}
}
}
@ -2101,7 +2160,7 @@ static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length
void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
{
/* enough to hold common pdf names, we don't need all the names */
char pdfname[64];
char pdfname[64] = {0};
const char *q2, *q3;
const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL;
const char *q = NULL;
@ -2382,7 +2441,10 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
if (objstate == STATE_LAUNCHACTION)
pdfobj_flag(pdf, obj, HAS_LAUNCHACTION);
if (dict_length > 0 && (objstate == STATE_JAVASCRIPT || objstate == STATE_OPENACTION || objstate == STATE_CONTENTS)) {
if (dict_length > 0 && (objstate == STATE_JAVASCRIPT ||
objstate == STATE_OPENACTION ||
objstate == STATE_CONTENTS ||
objstate == STATE_URI)) {
off_t dict_remaining = dict_length;
if (objstate == STATE_OPENACTION)
@ -2447,6 +2509,9 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
case STATE_CONTENTS:
flag = OBJ_CONTENTS;
break;
case STATE_URI:
flag = OBJ_URI;
break;
default:
cli_dbgmsg("pdf_parseobj: Unexpected object type\n");
return;
@ -4669,6 +4734,78 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
cli_jsonint_array(colorsobj, obj->id >> 8);
}
static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
cli_ctx *ctx = NULL;
off_t bytesleft = 0;
char *uri_start = NULL;
char *uri_heap = NULL;
const char *objstart = NULL;
json_object *uriarr = NULL;
UNUSEDPARAM(act);
if (!(pdf) || !(pdf->ctx) || !(pdf->ctx->wrkproperty) || !obj) {
return;
}
objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
: (const char *)(obj->start + pdf->map);
ctx = pdf->ctx;
if (!(SCAN_COLLECT_METADATA) || !(SCAN_STORE_PDF_URIS)) {
return;
}
if (obj->size == 0) {
return;
}
if (obj->objstm) {
bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
} else {
bytesleft = MIN(obj->size, pdf->size - obj->start);
}
// Advance forward to the first '(' character
size_t start = 0;
while (bytesleft > 0 && objstart[start] != '(') {
start++;
bytesleft--;
}
if (bytesleft == 0) {
return;
}
// The first character past '(' is the start of the URI
uri_start = (char *)(objstart + start + 1);
bytesleft--;
// Advance forward to the first ')' character
size_t end = 0;
while (bytesleft > 0 && uri_start[end] != ')') {
end++;
bytesleft--;
}
if (uri_start[end] != ')') {
return;
}
// Create a new string containing only the URI
CLI_MAX_MALLOC_OR_GOTO_DONE(uri_heap, end + 1,
cli_errmsg("cli_pdf: malloc() failed (URI)\n"));
strncpy(uri_heap, uri_start, end);
uri_heap[end] = '\0';
uriarr = cli_jsonarray(pdf->ctx->wrkproperty, "URIs");
if (!uriarr) {
cli_errmsg("cli_pdf: malloc() failed (URI array)\n");
goto done;
}
cli_jsonstr(uriarr, NULL, uri_heap);
done:
free(uri_heap);
}
static void pdf_free_stats(struct pdf_struct *pdf)
{

View file

@ -2082,7 +2082,7 @@ done:
return ret;
}
const char *const HTML_URLS_JSON_KEY = "HTMLUrls";
const char *const HTML_URIS_JSON_KEY = "URIs";
/* https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml */
const char *URI_LIST[] = {
"aaa://",
@ -2495,7 +2495,7 @@ static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_da
return;
}
if (!(SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) {
if (!(SCAN_STORE_HTML_URIS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) {
return;
}
@ -2503,9 +2503,9 @@ static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_da
for (i = 0; i < hrefs->count; i++) {
if (is_url((const char *)hrefs->value[i], strlen((const char *)hrefs->value[i]))) {
if (NULL == ary) {
ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY);
ary = cli_jsonarray(ctx->wrkproperty, HTML_URIS_JSON_KEY);
if (!ary) {
cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY);
cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URIS_JSON_KEY);
return;
}
}
@ -2517,9 +2517,9 @@ static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_da
for (i = 0; i < (int)form_data->count; i++) {
if (is_url((const char *)form_data->urls[i], strlen((const char *)form_data->urls[i]))) {
if (NULL == ary) {
ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY);
ary = cli_jsonarray(ctx->wrkproperty, HTML_URIS_JSON_KEY);
if (!ary) {
cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY);
cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URIS_JSON_KEY);
return;
}
}
@ -2560,7 +2560,7 @@ static cl_error_t cli_scanhtml(cli_ctx *ctx)
cli_dbgmsg("cli_scanhtml: using tempdir %s\n", tempname);
/* Output JSON Summary Information */
if (SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) {
if (SCAN_STORE_HTML_URIS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) {
tag_arguments_t hrefs = {0};
hrefs.scanContents = 1;
form_data_t form_data = {0};
@ -4311,7 +4311,7 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi
free_duplicate_fmap(new_map);
}
} // end check for embedded files
} // end if (fpt->offset > 0)
} // end if (fpt->offset > 0)
if ((nret == CL_EMEM) ||
(ctx->abort_scan) ||

View file

@ -39,7 +39,7 @@ class TC(testcase.TestCase):
tempdir=self.path_tmp / "TD"
if not os.path.isdir(tempdir):
os.makedirs(tempdir);
os.makedirs(tempdir)
testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'html' / 'index.html'
command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format(
@ -52,8 +52,9 @@ class TC(testcase.TestCase):
assert output.ec == 0 # clean
expected_strings = [ 'HTMLUrls'
, '"https://www.clamav.net/reports/malware"'
, '"http://www.google.com"'
]
expected_strings = [
'URIs',
'"https://www.clamav.net/reports/malware"',
'"http://www.google.com"'
]
self.verify_metadata_json(tempdir, expected_strings)

View file

@ -0,0 +1,85 @@
# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
"""
Run clamscan tests.
"""
import sys
import os
import re
import shutil
sys.path.append('../unit_tests')
import testcase
class TC(testcase.TestCase):
@classmethod
def setUpClass(cls):
super(TC, cls).setUpClass()
@classmethod
def tearDownClass(cls):
super(TC, cls).tearDownClass()
def setUp(self):
super(TC, self).setUp()
def tearDown(self):
super(TC, self).tearDown()
# Remove scan temps directory between tests
if (self.path_tmp / "TD").exists():
shutil.rmtree(self.path_tmp / "TD")
self.verify_valgrind_log()
def test_save_links(self):
self.step_name('Extract Links')
tempdir=self.path_tmp / "TD"
if not os.path.isdir(tempdir):
os.makedirs(tempdir)
testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'uri-and-ref.pdf'
command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
tempdir=tempdir,
testfile=testfile,
)
output = self.execute_command(command)
assert output.ec == 0 # clean
expected_strings = [
'URIs',
'"https://docs.clamav.net/manual/Development.html"',
'"https://docs.clamav.net/"'
]
self.verify_metadata_json(tempdir, expected_strings)
def test_out_of_order_links(self):
self.step_name('Out-of-Order Links')
tempdir=self.path_tmp / "TD"
if not os.path.isdir(tempdir):
os.makedirs(tempdir)
testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'out-of-order.pdf'
command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format(
valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
tempdir=tempdir,
testfile=testfile,
)
output = self.execute_command(command)
assert output.ec == 0 # clean
expected_strings = [
'URIs',
'"https://docs.clamav.net/manual/Development.html"',
'"https://docs.clamav.net/"'
]
self.verify_metadata_json(tempdir, expected_strings)

View file

@ -0,0 +1,62 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog
/Outlines 2 0 R
/Pages 3 0 R
>>
endobj
2 0 obj
<< /Type Outlines
/Count 0
>>
endobj
3 0 obj
<< /Type /Pages
/Kids [ 4 0 R ]
/Count 1
>>
endobj
4 0 obj
<< /Type /Page
/Parent 3 0 R
/MediaBox [ 0 0 612 792 ]
/Contents 5 0 R
/Resources << /ProcSet 6 0 R >>
>>
endobj
26 0 obj
(https://docs.clamav.net/)
endobj
24 0 obj
<< /URI (https://docs.clamav.net/manual/Development.html)
/S /URI >>
endobj
25 0 obj
<< /Type /Action /S /URI /URI 26 0 R >>
endobj
5 0 obj
<< /Length 35 >>
stream Page-marking operators endstream
endobj
6 0 obj
[ /PDF ]
endobj
xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000074 00000 n
0000000119 00000 n
0000000178 00000 n
0000000299 00000 n
0000000384 00000 n
0000000440 00000 n
0000000483 00000 n
0000000555 00000 n
trailer
<< /Size 7
/Root 1 0 R
>>
startxref
408
%%EOF

View file

@ -0,0 +1,62 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog
/Outlines 2 0 R
/Pages 3 0 R
>>
endobj
2 0 obj
<< /Type Outlines
/Count 0
>>
endobj
3 0 obj
<< /Type /Pages
/Kids [ 4 0 R ]
/Count 1
>>
endobj
4 0 obj
<< /Type /Page
/Parent 3 0 R
/MediaBox [ 0 0 612 792 ]
/Contents 5 0 R
/Resources << /ProcSet 6 0 R >>
>>
endobj
24 0 obj
<< /URI (https://docs.clamav.net/manual/Development.html)
/S /URI >>
endobj
25 0 obj
<< /Type /Action /S /URI /URI 26 0 R >>
endobj
26 0 obj
(https://docs.clamav.net/)
endobj
5 0 obj
<< /Length 35 >>
stream Page-marking operators endstream
endobj
6 0 obj
[ /PDF ]
endobj
xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000074 00000 n
0000000119 00000 n
0000000178 00000 n
0000000299 00000 n
0000000384 00000 n
0000000440 00000 n
0000000483 00000 n
0000000555 00000 n
trailer
<< /Size 7
/Root 1 0 R
>>
startxref
408
%%EOF

View file

@ -226,11 +226,17 @@ TCPAddr localhost
# Default: no
#GenerateMetadataJson yes
# Store URLs found in html files to the json metadata.
# URLs will be stored in an array with the tag 'HTMLUrls'
# Store URIs found in html files to the json metadata.
# URIs will be stored in an array with the tag 'URIs'
# GenerateMetadataJson is required for this feature.
# Default: yes (if GenerateMetadataJson is used)
#JsonStoreHTMLUrls no
#JsonStoreHTMLURIs no
# Store URIs found in pdf files to the json metadata.
# URIs will be stored in an array with the tag 'URIs'
# GenerateMetadataJson is required for this feature.
# Default: yes (if GenerateMetadataJson is used)
#JsonStorePDFURIs no
# Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
# any ALLMATCHSCAN command as invalid.