diff --git a/clamscan/clamscan.c b/clamscan/clamscan.c index 9bfd93587..83abe0fb8 100644 --- a/clamscan/clamscan.c +++ b/clamscan/clamscan.c @@ -254,8 +254,10 @@ void help(void) mprintf(LOGG_INFO, " --gen-json[=yes/no(*)] Generate JSON metadata for the scanned file(s). For testing & development use ONLY.\n"); mprintf(LOGG_INFO, " JSON will be printed if --debug is enabled.\n"); mprintf(LOGG_INFO, " A JSON file will dropped to the temp directory if --leave-temps is enabled.\n"); - mprintf(LOGG_INFO, " --json-store-html-urls[=yes(*)/no] Store html URLs in metadata.\n"); - mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n"); + mprintf(LOGG_INFO, " --json-store-html-uris[=yes(*)/no] Store html URIs in metadata.\n"); + mprintf(LOGG_INFO, " URIs will be written to the metadata.json file in an array called 'URIs'\n"); + mprintf(LOGG_INFO, " --json-store-pdf-uris[=yes(*)/no] Store pdf URIs in metadata.\n"); + mprintf(LOGG_INFO, " URIs will be written to the metadata.json file in an array called 'URIs'\n"); mprintf(LOGG_INFO, " --database=FILE/DIR -d FILE/DIR Load virus database from FILE or load all supported db files from DIR\n"); mprintf(LOGG_INFO, " --official-db-only[=yes/no(*)] Only load official signatures\n"); mprintf(LOGG_INFO, " --fail-if-cvd-older-than=days Return with a nonzero error code if virus database outdated.\n"); diff --git a/clamscan/manager.c b/clamscan/manager.c index d6b38a66d..d861ec88e 100644 --- a/clamscan/manager.c +++ b/clamscan/manager.c @@ -1574,8 +1574,12 @@ int scanmanager(const struct optstruct *opts) options.general |= CL_SCAN_GENERAL_HEURISTICS; } - if (optget(opts, "json-store-html-urls")->enabled) { - options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS; + if (optget(opts, "json-store-html-uris")->enabled) { + options.general |= CL_SCAN_GENERAL_STORE_HTML_URIS; + } + + if (optget(opts, "json-store-pdf-uris")->enabled) { + options.general |= CL_SCAN_GENERAL_STORE_PDF_URIS; } /* TODO: Remove
deprecated option in a future feature release */ diff --git a/common/optparser.c b/common/optparser.c index 5014f9e88..717011c53 100644 --- a/common/optparser.c +++ b/common/optparser.c @@ -389,7 +389,8 @@ const struct clam_option __clam_options[] = { {"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"}, {"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"}, - {"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML
@@ -168,7 +173,8 @@ struct cl_scan_options { #define CL_SCAN_GENERAL_HEURISTICS 0x4 /* option to enable heuristic alerts */ #define CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE 0x8 /* allow heuristic match to take precedence. */ #define CL_SCAN_GENERAL_UNPRIVILEGED 0x10 /* scanner will not have read access to files. */ -#define CL_SCAN_GENERAL_STORE_HTML_URLS 0x20 /* Store urls found in html options->general & CL_SCAN_GENERAL_HEURISTICS) #define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE) #define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED) -#define SCAN_STORE_HTML_URLS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URLS) +#define SCAN_STORE_HTML_URIS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URIS) +#define SCAN_STORE_PDF_URIS (ctx->options->general & CL_SCAN_GENERAL_STORE_PDF_URIS) #define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE) #define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF) diff --git a/libclamav/pdf.c b/libclamav/pdf.c index 1edf273e7..adcc42351 100644 --- a/libclamav/pdf.c +++ b/libclamav/pdf.c @@ -116,6 +116,7 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); +static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); /* End PDF statistics callbacks and related */ @@ -1446,22 +1447,28 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf, struct pdf_obj *obj cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags) { + cl_error_t status = CL_SUCCESS; + cl_error_t ret; + char fullname[PATH_MAX + 1]; - int fout = -1; - size_t sum = 0; - cl_error_t rc = 
CL_SUCCESS; - int dump = 1; + bool extracted_an_object = false; + int fout = -1; + size_t sum = 0; + bool dump = true; + struct pdf_dict *dparams = NULL; cli_dbgmsg("pdf_extract_obj: obj %u %u\n", obj->id >> 8, obj->id & 0xff); if (PDF_OBJECT_RECURSION_LIMIT < pdf->parse_recursion_depth) { cli_dbgmsg("pdf_extract_obj: Recursion limit reached.\n"); - return CL_SUCCESS; + status = CL_SUCCESS; + goto done; } if (obj->extracted) { // Should not attempt to extract the same object more than once. - return CL_SUCCESS; + status = CL_SUCCESS; + goto done; } // We're not done yet, but this is enough to say we've tried. // Trying again won't help any. @@ -1471,28 +1478,38 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t cli_dbgmsg("pdf_extract_obj: extracting obj found in objstm.\n"); if (obj->objstm->streambuf == NULL) { cli_warnmsg("pdf_extract_obj: object in object stream has null stream buffer!\n"); - return CL_EFORMAT; + status = CL_EFORMAT; + goto done; } } + /* Check to see if this is a URI referenced from a prior URI object */ + if (obj->flags & (1 << OBJ_URI)) { + URI_cb(pdf, obj, NULL); + status = CL_SUCCESS; + goto done; + } + /* TODO: call bytecode hook here, allow override dumpability */ if ((!(obj->flags & (1 << OBJ_STREAM)) || (obj->flags & (1 << OBJ_HASFILTERS))) && !(obj->flags & DUMP_MASK)) { /* don't dump all streams */ - dump = 0; + dump = false; } if ((obj->flags & (1 << OBJ_IMAGE)) && !(obj->flags & (1 << OBJ_FILTER_DCT))) { /* don't dump / scan non-JPG images */ - dump = 0; + dump = false; } if (obj->flags & (1 << OBJ_FORCEDUMP)) { /* bytecode can force dump by setting this flag */ - dump = 1; + dump = true; } - if (!dump) - return CL_CLEAN; + if (!dump) { + status = CL_SUCCESS; + goto done; + } cli_dbgmsg("pdf_extract_obj: dumping obj %u %u\n", obj->id >> 8, obj->id & 0xff); @@ -1501,11 +1518,17 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t if (fout < 0) { char err[128]; 
cli_errmsg("pdf_extract_obj: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err))); - - return CL_ETMPFILE; + status = CL_ETMPFILE; + goto done; } + extracted_an_object = true; + if (!(flags & PDF_EXTRACT_OBJ_SCAN)) { + /* + * When PDF_EXTRACT_OBJ_SCAN is not set, this function is used to extract the object to a temp file + * and so we need to save off the path in obj->path for the caller to use. + */ if (NULL != obj->path) { obj->path = strdup(fullname); } @@ -1525,7 +1548,6 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */ const char *pstr; - struct pdf_dict *dparams = NULL; struct objstm_struct *objstm = NULL; int xref = 0; @@ -1582,7 +1604,10 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t length = obj->stream_size; if (0 == length) { cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n"); - goto done; /* Empty stream, nothing to scan */ + + /* Empty stream, nothing to scan */ + status = CL_SUCCESS; + goto done; } } @@ -1647,15 +1672,15 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms); if (!pdf->objstms) { cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); - pdf_free_dict(dparams); - return CL_EMEM; + status = CL_EMEM; + goto done; } objstm = malloc(sizeof(struct objstm_struct)); if (!objstm) { cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); - pdf_free_dict(dparams); - return CL_EMEM; + status = CL_EMEM; + goto done; } pdf->objstms[pdf->nobjstms - 1] = objstm; @@ -1673,18 +1698,18 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t } } - sum = pdf_decodestream(pdf, obj, dparams, 
obj->stream, (uint32_t)length, xref, fout, &rc, objstm); - if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) { - cli_dbgmsg("Error decoding stream! Error code: %d\n", rc); + sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &status, objstm); + if ((CL_SUCCESS != status) && (CL_VIRUS != status)) { + cli_dbgmsg("Error decoding stream! Error code: %d\n", status); /* It's ok if we couldn't decode the stream, * make a best effort to keep parsing... * Unless we were unable to allocate memory.*/ - if (CL_EMEM == rc) { - goto really_done; + if (CL_EMEM == status) { + goto done; } - if (CL_EPARSE == rc) { - rc = CL_SUCCESS; + if (CL_EPARSE == status) { + status = CL_SUCCESS; } if (NULL != objstm) { @@ -1713,7 +1738,8 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t if (!pdf->objstms) { cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n"); - return CL_EMEM; + status = CL_EMEM; + goto done; } } } else { @@ -1724,11 +1750,13 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t } } - if (dparams) + if (dparams) { pdf_free_dict(dparams); + dparams = NULL; + } - if (rc == CL_VIRUS) { - sum = 0; /* prevents post-filter scan */ + if (status == CL_VIRUS) { + /* skip post-filter scan */ goto done; } @@ -1741,7 +1769,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t off_t bytesleft = obj->size; if (bytesleft < 0) { - goto done; + goto scan_extracted_objects; } do { @@ -1789,7 +1817,7 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t pdf->stats.njs++; if (filter_writen(pdf, obj, fout, out, js_len, (size_t *)&sum) != js_len) { - rc = CL_EWRITE; + status = CL_EWRITE; free(js); break; } @@ -1824,64 +1852,81 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t off_t bytesleft = obj->size; if (bytesleft < 0) - rc = CL_EFORMAT; + status = CL_EFORMAT; else { if 
(obj->objstm) { - if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) - rc = CL_EWRITE; + if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) { + status = CL_EWRITE; + } } else { - if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) - rc = CL_EWRITE; + if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) { + status = CL_EWRITE; + } + } + } + } + +scan_extracted_objects: + + cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff); + cli_dbgmsg("pdf_extract_obj: ... to %s\n", fullname); + + if ((flags & PDF_EXTRACT_OBJ_SCAN) && (sum > 0)) { + /* + * Scan the extracted objects for potential threats. + * PDF_EXTRACT_OBJ_SCAN is used when the extracted object should be scanned and then deleted. + */ + + /* TODO: invoke bytecode on this pdf obj with metainformation associated */ + lseek(fout, 0, SEEK_SET); + ret = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE); + if (ret != CL_SUCCESS) { + status = ret; + goto done; + } + + if ((status == CL_CLEAN) || (status == CL_VIRUS)) { + ret = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout); + if (ret == CL_VIRUS) { + status = ret; + goto done; + } + } + + if (((status == CL_CLEAN) || (status == CL_VIRUS)) && (obj->flags & (1 << OBJ_CONTENTS))) { + lseek(fout, 0, SEEK_SET); + cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff); + + ret = pdf_scan_contents(fout, pdf, obj); + if (ret != CL_SUCCESS) { + status = ret; + goto done; } } } done: - cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff); - cli_dbgmsg("pdf_extract_obj: ... 
to %s\n", fullname); + if (NULL != dparams) { + pdf_free_dict(dparams); + } - if (flags & PDF_EXTRACT_OBJ_SCAN && sum) { - int rc2; + if (-1 != fout) { + close(fout); + } - /* TODO: invoke bytecode on this pdf obj with metainformation associated */ - lseek(fout, 0, SEEK_SET); - rc2 = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE); - if (rc2 != CL_SUCCESS) { - rc = rc2; - goto really_done; - } - - if ((rc == CL_CLEAN) || (rc == CL_VIRUS)) { - rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout); - if (rc2 == CL_VIRUS) { - rc = rc2; - goto really_done; - } - } - - if (((rc == CL_CLEAN) || (rc == CL_VIRUS)) && (obj->flags & (1 << OBJ_CONTENTS))) { - lseek(fout, 0, SEEK_SET); - cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff); - - rc2 = pdf_scan_contents(fout, pdf, obj); - if (rc2 != CL_SUCCESS) { - rc = rc2; - goto really_done; - } + if (extracted_an_object && (flags & PDF_EXTRACT_OBJ_SCAN) && !pdf->ctx->engine->keeptmp) { + /* + * When PDF_EXTRACT_OBJ_SCAN is set, the goal is to extract, scan, and delete it. + * If it was not set, we would keep it and the path is passed back obj->path for the caller to use. + * That's why we wouldn't unlink it here. 
+ */ + if (cli_unlink(fullname) && status != CL_VIRUS) { + status = CL_EUNLINK; } } -really_done: - close(fout); - - if (CL_EMEM != rc) { - if (flags & PDF_EXTRACT_OBJ_SCAN && !pdf->ctx->engine->keeptmp) - if (cli_unlink(fullname) && rc != CL_VIRUS) - rc = CL_EUNLINK; - } - - return rc; + return status; } enum objstate { @@ -1893,6 +1938,7 @@ enum objstate { STATE_LINEARIZED, STATE_LAUNCHACTION, STATE_CONTENTS, + STATE_URI, STATE_ANY /* for actions table below */ }; @@ -1954,7 +2000,8 @@ static struct pdfname_action pdfname_actions[] = { {"Colors", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Colors_cb}, {"RichMedia", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, RichMedia_cb}, {"AcroForm", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AcroForm_cb}, - {"XFA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, XFA_cb}}; + {"XFA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, XFA_cb}, + {"URI", OBJ_DICT, STATE_NONE, STATE_URI, NAMEFLAG_NONE, URI_cb}}; #define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT)) @@ -1963,12 +2010,24 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const ch struct pdfname_action *act = NULL; unsigned j; + // If we process STATE_S we will get duplicate URIs from the prior STATE_NONE + if (!strcmp(pdfname, "URI") && *state == STATE_S) { + *state = STATE_NONE; + return; + } + obj->statsflags |= OBJ_FLAG_PDFNAME_DONE; - for (j = 0; j < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); j++) { - if (!strcmp(pdfname, pdfname_actions[j].pdfname)) { - act = &pdfname_actions[j]; - break; + static struct pdfname_action uri_ref_action = {"URI", OBJ_DICT, STATE_ANY, STATE_URI, NAMEFLAG_NONE, URI_cb}; /* static storage: act is read after the if-block below, so a block-scoped compound literal would leave it dangling */ + if (obj->flags & (1 << OBJ_URI)) { /* this object was observed to be a reference to a URI */ + act = &uri_ref_action; + } + if (!act) { + for (j = 0; j <
sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); j++) { + if (!strcmp(pdfname, pdfname_actions[j].pdfname)) { + act = &pdfname_actions[j]; + break; + } } } @@ -2101,7 +2160,7 @@ static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) { /* enough to hold common pdf names, we don't need all the names */ - char pdfname[64]; + char pdfname[64] = {0}; const char *q2, *q3; const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL; const char *q = NULL; @@ -2382,7 +2441,10 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) if (objstate == STATE_LAUNCHACTION) pdfobj_flag(pdf, obj, HAS_LAUNCHACTION); - if (dict_length > 0 && (objstate == STATE_JAVASCRIPT || objstate == STATE_OPENACTION || objstate == STATE_CONTENTS)) { + if (dict_length > 0 && (objstate == STATE_JAVASCRIPT || + objstate == STATE_OPENACTION || + objstate == STATE_CONTENTS || + objstate == STATE_URI)) { off_t dict_remaining = dict_length; if (objstate == STATE_OPENACTION) @@ -2447,6 +2509,9 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) case STATE_CONTENTS: flag = OBJ_CONTENTS; break; + case STATE_URI: + flag = OBJ_URI; + break; default: cli_dbgmsg("pdf_parseobj: Unexpected object type\n"); return; @@ -4669,6 +4734,78 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam cli_jsonint_array(colorsobj, obj->id >> 8); } +static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) +{ + cli_ctx *ctx = NULL; + off_t bytesleft = 0; + char *uri_start = NULL; + char *uri_heap = NULL; + const char *objstart = NULL; + json_object *uriarr = NULL; + + UNUSEDPARAM(act); + + if (!(pdf) || !(pdf->ctx) || !(pdf->ctx->wrkproperty) || !obj) { + return; + } + + objstart = (obj->objstm) ? 
(const char *)(obj->start + obj->objstm->streambuf) + : (const char *)(obj->start + pdf->map); + ctx = pdf->ctx; + + if (!(SCAN_COLLECT_METADATA) || !(SCAN_STORE_PDF_URIS)) { + return; + } + + if (obj->size == 0) { + return; + } + + if (obj->objstm) { + bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start); + } else { + bytesleft = MIN(obj->size, pdf->size - obj->start); + } + + // Advance forward to the first '(' character + size_t start = 0; + while (bytesleft > 0 && objstart[start] != '(') { + start++; + bytesleft--; + } + if (bytesleft <= 0) { /* no '(' inside the object; <= also rejects the case where the MIN() above ever yields a negative count */ + return; + } + // The first character past '(' is the start of the URI + uri_start = (char *)(objstart + start + 1); + bytesleft--; + + // Advance forward to the first ')' character (escaped or nested parentheses are not interpreted) + size_t end = 0; + while (bytesleft > 0 && uri_start[end] != ')') { + end++; + bytesleft--; + } + if (bytesleft <= 0) { /* ran out of bytes before ')': uri_start[end] is past the scanned range, so it must not be read */ + return; + } + + // Create a new string containing only the URI + CLI_MAX_MALLOC_OR_GOTO_DONE(uri_heap, end + 1, + cli_errmsg("cli_pdf: malloc() failed (URI)\n")); + strncpy(uri_heap, uri_start, end); + uri_heap[end] = '\0'; + + uriarr = cli_jsonarray(pdf->ctx->wrkproperty, "URIs"); + if (!uriarr) { + cli_errmsg("cli_pdf: malloc() failed (URI array)\n"); + goto done; + } + cli_jsonstr(uriarr, NULL, uri_heap); +done: + free(uri_heap); +} + static void pdf_free_stats(struct pdf_struct *pdf) { diff --git a/libclamav/scanners.c b/libclamav/scanners.c index b32eeeca0..44bccdc16 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -2082,7 +2082,7 @@ done: return ret; } -const char *const HTML_URLS_JSON_KEY = "HTMLUrls"; +const char *const HTML_URIS_JSON_KEY = "URIs"; /* https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml */ const char *URI_LIST[] = { "aaa://", @@ -2495,7 +2495,7 @@ static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_da return; } - if (!(SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) { + if (!(SCAN_STORE_HTML_URIS
&& SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) { return; } @@ -2503,9 +2503,9 @@ static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_da for (i = 0; i < hrefs->count; i++) { if (is_url((const char *)hrefs->value[i], strlen((const char *)hrefs->value[i]))) { if (NULL == ary) { - ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY); + ary = cli_jsonarray(ctx->wrkproperty, HTML_URIS_JSON_KEY); if (!ary) { - cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY); + cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URIS_JSON_KEY); return; } } @@ -2517,9 +2517,9 @@ static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_da for (i = 0; i < (int)form_data->count; i++) { if (is_url((const char *)form_data->urls[i], strlen((const char *)form_data->urls[i]))) { if (NULL == ary) { - ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY); + ary = cli_jsonarray(ctx->wrkproperty, HTML_URIS_JSON_KEY); if (!ary) { - cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY); + cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URIS_JSON_KEY); return; } } @@ -2560,7 +2560,7 @@ static cl_error_t cli_scanhtml(cli_ctx *ctx) cli_dbgmsg("cli_scanhtml: using tempdir %s\n", tempname); /* Output JSON Summary Information */ - if (SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) { + if (SCAN_STORE_HTML_URIS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) { tag_arguments_t hrefs = {0}; hrefs.scanContents = 1; form_data_t form_data = {0}; @@ -4311,7 +4311,7 @@ static cl_error_t scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_fi free_duplicate_fmap(new_map); } } // end check for embedded files - } // end if (fpt->offset > 0) + } // end if (fpt->offset > 0) if ((nret == CL_EMEM) || (ctx->abort_scan) || diff --git a/unit_tests/clamscan/save_html_urls_test.py 
b/unit_tests/clamscan/save_html_uris_test.py similarity index 88% rename from unit_tests/clamscan/save_html_urls_test.py rename to unit_tests/clamscan/save_html_uris_test.py index d7e0993bf..6ffeddd09 100644 --- a/unit_tests/clamscan/save_html_urls_test.py +++ b/unit_tests/clamscan/save_html_uris_test.py @@ -39,7 +39,7 @@ class TC(testcase.TestCase): tempdir=self.path_tmp / "TD" if not os.path.isdir(tempdir): - os.makedirs(tempdir); + os.makedirs(tempdir) testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'html' / 'index.html' command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format( @@ -52,8 +52,9 @@ class TC(testcase.TestCase): assert output.ec == 0 # clean - expected_strings = [ 'HTMLUrls' - , '"https://www.clamav.net/reports/malware"' - , '"http://www.google.com"' - ] + expected_strings = [ + 'URIs', + '"https://www.clamav.net/reports/malware"', + '"http://www.google.com"' + ] self.verify_metadata_json(tempdir, expected_strings) diff --git a/unit_tests/clamscan/save_pdf_uris_test.py b/unit_tests/clamscan/save_pdf_uris_test.py new file mode 100644 index 000000000..df6466fe2 --- /dev/null +++ b/unit_tests/clamscan/save_pdf_uris_test.py @@ -0,0 +1,85 @@ +# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved. + +""" +Run clamscan tests. 
+""" + +import sys +import os +import re +import shutil + +sys.path.append('../unit_tests') +import testcase + + +class TC(testcase.TestCase): + @classmethod + def setUpClass(cls): + super(TC, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + super(TC, cls).tearDownClass() + + def setUp(self): + super(TC, self).setUp() + + def tearDown(self): + super(TC, self).tearDown() + + # Remove scan temps directory between tests + if (self.path_tmp / "TD").exists(): + shutil.rmtree(self.path_tmp / "TD") + + self.verify_valgrind_log() + + def test_save_links(self): + self.step_name('Extract Links') + + tempdir=self.path_tmp / "TD" + if not os.path.isdir(tempdir): + os.makedirs(tempdir) + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'uri-and-ref.pdf' + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format( + valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + tempdir=tempdir, + testfile=testfile, + ) + output = self.execute_command(command) + + assert output.ec == 0 # clean + + expected_strings = [ + 'URIs', + '"https://docs.clamav.net/manual/Development.html"', + '"https://docs.clamav.net/"' + ] + self.verify_metadata_json(tempdir, expected_strings) + + def test_out_of_order_links(self): + self.step_name('Out-of-Order Links') + + tempdir=self.path_tmp / "TD" + if not os.path.isdir(tempdir): + os.makedirs(tempdir) + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'out-of-order.pdf' + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format( + valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + 
tempdir=tempdir, + testfile=testfile, + ) + output = self.execute_command(command) + + assert output.ec == 0 # clean + + expected_strings = [ + 'URIs', + '"https://docs.clamav.net/manual/Development.html"', + '"https://docs.clamav.net/"' + ] + self.verify_metadata_json(tempdir, expected_strings) diff --git a/unit_tests/input/other_scanfiles/pdf/out-of-order.pdf b/unit_tests/input/other_scanfiles/pdf/out-of-order.pdf new file mode 100644 index 000000000..9a1317a03 --- /dev/null +++ b/unit_tests/input/other_scanfiles/pdf/out-of-order.pdf @@ -0,0 +1,62 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog +/Outlines 2 0 R +/Pages 3 0 R +>> +endobj +2 0 obj +<< /Type Outlines +/Count 0 +>> +endobj +3 0 obj +<< /Type /Pages +/Kids [ 4 0 R ] +/Count 1 +>> +endobj +4 0 obj +<< /Type /Page +/Parent 3 0 R +/MediaBox [ 0 0 612 792 ] +/Contents 5 0 R +/Resources << /ProcSet 6 0 R >> +>> +endobj +26 0 obj +(https://docs.clamav.net/) +endobj +24 0 obj +<< /URI (https://docs.clamav.net/manual/Development.html) +/S /URI >> +endobj +25 0 obj +<< /Type /Action /S /URI /URI 26 0 R >> +endobj +5 0 obj +<< /Length 35 >> +stream Page-marking operators endstream +endobj +6 0 obj +[ /PDF ] +endobj +xref +0 7 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000119 00000 n +0000000178 00000 n +0000000299 00000 n +0000000384 00000 n +0000000440 00000 n +0000000483 00000 n +0000000555 00000 n +trailer +<< /Size 7 +/Root 1 0 R +>> +startxref +408 +%%EOF \ No newline at end of file diff --git a/unit_tests/input/other_scanfiles/pdf/uri-and-ref.pdf b/unit_tests/input/other_scanfiles/pdf/uri-and-ref.pdf new file mode 100644 index 000000000..739fe2c71 --- /dev/null +++ b/unit_tests/input/other_scanfiles/pdf/uri-and-ref.pdf @@ -0,0 +1,62 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog +/Outlines 2 0 R +/Pages 3 0 R +>> +endobj +2 0 obj +<< /Type Outlines +/Count 0 +>> +endobj +3 0 obj +<< /Type /Pages +/Kids [ 4 0 R ] +/Count 1 +>> +endobj +4 0 obj +<< /Type /Page +/Parent 3 0 R +/MediaBox [ 0 0 612 
792 ] +/Contents 5 0 R +/Resources << /ProcSet 6 0 R >> +>> +endobj +24 0 obj +<< /URI (https://docs.clamav.net/manual/Development.html) +/S /URI >> +endobj +25 0 obj +<< /Type /Action /S /URI /URI 26 0 R >> +endobj +26 0 obj +(https://docs.clamav.net/) +endobj +5 0 obj +<< /Length 35 >> +stream Page-marking operators endstream +endobj +6 0 obj +[ /PDF ] +endobj +xref +0 7 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000119 00000 n +0000000178 00000 n +0000000299 00000 n +0000000384 00000 n +0000000440 00000 n +0000000483 00000 n +0000000555 00000 n +trailer +<< /Size 7 +/Root 1 0 R +>> +startxref +408 +%%EOF \ No newline at end of file diff --git a/win32/conf_examples/clamd.conf.sample b/win32/conf_examples/clamd.conf.sample index 580afe0ea..17a4a1625 100644 --- a/win32/conf_examples/clamd.conf.sample +++ b/win32/conf_examples/clamd.conf.sample @@ -226,11 +226,17 @@ TCPAddr localhost # Default: no #GenerateMetadataJson yes -# Store URLs found in html files to the json metadata. -# URLs will be stored in an array with the tag 'HTMLUrls' +# Store URIs found in html files to the json metadata. +# URIs will be stored in an array with the tag 'URIs' # GenerateMetadataJson is required for this feature. # Default: yes (if GenerateMetadataJson is used) -#JsonStoreHTMLUrls no +#JsonStoreHTMLURIs no + +# Store URIs found in pdf files to the json metadata. +# URIs will be stored in an array with the tag 'URIs' +# GenerateMetadataJson is required for this feature. +# Default: yes (if GenerateMetadataJson is used) +#JsonStorePDFURIs no # Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject # any ALLMATCHSCAN command as invalid.