diff --git a/libclamav/pdf.c b/libclamav/pdf.c index a56ddd24b..78998c6f2 100644 --- a/libclamav/pdf.c +++ b/libclamav/pdf.c @@ -87,6 +87,7 @@ static void pdf_export_json(struct pdf_struct *); static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); +static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); @@ -1652,12 +1653,13 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t * is an object stream. If so, collect the relevant info. */ dict_len = obj->stream - start; - if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) { + if (NULL != (pstr = pdf_getdict(start, &dict_len, "/ObjStm"))) { int objstm_first = -1; int objstm_length = -1; int objstm_n = -1; - cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n"); + cli_dbgmsg("pdf_extract_obj: Found /ObjStm\n"); + pdf->stats.nobjstream++; dict_len = obj->stream - start; if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) { @@ -1668,14 +1670,17 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n"); } else { /* Add objstm to pdf struct, so it can be freed eventually */ - pdf->nobjstms++; - pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms); + if (!pdf->objstms) { + pdf->objstms = malloc(sizeof(struct objstm_struct *)); + } else { + pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * (pdf->nobjstms + 1)); + } if (!pdf->objstms) { 
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); status = CL_EMEM; goto done; } - + pdf->nobjstms++; objstm = malloc(sizeof(struct objstm_struct)); if (!objstm) { cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); @@ -1955,6 +1960,7 @@ struct pdfname_action { }; static struct pdfname_action pdfname_actions[] = { + {"AA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AutomaticAction_cb}, {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb}, {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb}, {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb}, @@ -2139,7 +2145,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len) static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length) { const char *enc; - + pdf->stats.ntrailer++; enc = cli_memstr(s, length, "/Encrypt", 8); if (enc) { char *newID; @@ -2223,6 +2229,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) if ((CL_SUCCESS == has_stream) || (CL_EFORMAT == has_stream)) { /* Stream found. Store this fact and the stream bounds. 
*/
+        pdf->stats.nstream++;
         cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size);
         obj->flags |= (1 << OBJ_STREAM);
         obj->stream = stream;
@@ -3902,6 +3909,8 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
             if (!q || xrefCheck(q, q + bytesleft) == -1) {
                 cli_dbgmsg("cli_pdf: did not find valid xref\n");
                 pdf.flags |= 1 << BAD_PDF_TRAILER;
+            } else {
+                pdf.stats.nxref++;
             }
         }
     }
@@ -4564,35 +4573,75 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
     }
 }
 
-static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
-{
-    UNUSEDPARAM(obj);
-    UNUSEDPARAM(act);
-
-    if (NULL == pdf)
-        return;
-
-    pdf->stats.nrichmedia++;
-}
-
+/**
+ * @brief Decide whether the PDF statistics callbacks should count anything.
+ *
+ * Each pointer is checked before it is dereferenced, so a NULL pdf or a NULL
+ * pdf->ctx disables counting instead of crashing.
+ *
+ * @param pdf  PDF parsing state; may be NULL.
+ * @return     Non-zero when pdf->ctx->wrkproperty is set and
+ *             SCAN_COLLECT_METADATA is true; 0 otherwise.
+ */
+static int pdf_collect_metadata_enabled(const struct pdf_struct *pdf)
+{
+    cli_ctx *ctx = NULL;
+
+    if (NULL == pdf || NULL == pdf->ctx) {
+        return 0;
+    }
+
+    /* NOTE(review): SCAN_COLLECT_METADATA appears to read the local `ctx`; confirm against its definition. */
+    ctx = pdf->ctx;
+    return (ctx->wrkproperty) && (SCAN_COLLECT_METADATA);
+}
+
 static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
 {
     UNUSEDPARAM(obj);
     UNUSEDPARAM(act);
 
-    if (NULL == pdf)
+    if (!pdf_collect_metadata_enabled(pdf)) {
         return;
+    }
 
     pdf->stats.nacroform++;
 }
 
+/* Count an object that carries an /AA entry (exported as AutomaticActionCount). */
+static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
+{
+    UNUSEDPARAM(obj);
+    UNUSEDPARAM(act);
+
+    if (!pdf_collect_metadata_enabled(pdf)) {
+        return;
+    }
+
+    // ToDO: Find a way to not count references to the same automatic action multiple times
+    pdf->stats.naa++;
+}
+
+static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
+{
+    UNUSEDPARAM(obj);
+    UNUSEDPARAM(act);
+
+    if (!pdf_collect_metadata_enabled(pdf)) {
+        return;
+    }
+
+    pdf->stats.nrichmedia++;
+}
+
 static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
 {
     UNUSEDPARAM(obj);
     UNUSEDPARAM(act);
 
-    if (NULL == pdf)
+    if (!pdf_collect_metadata_enabled(pdf)) {
         return;
+    }
 
     pdf->stats.nxfa++;
 }
@@ -4761,6 +4810,8 @@ static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
         return;
     }
 
+    pdf->stats.nuri++;
+
     if (obj->objstm) {
         bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
     } else {
@@ -5118,8 +5169,22 @@ static void pdf_export_json(struct pdf_struct *pdf)
         cli_jsonint(pdfobj, "RichMediaCount", pdf->stats.nrichmedia);
     if (pdf->stats.nacroform)
         cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform);
     if (pdf->stats.nxfa)
         cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa);
+    if (pdf->stats.naa)
+        cli_jsonint(pdfobj, "AutomaticActionCount", pdf->stats.naa);
+    if (pdf->stats.nstream)
+        cli_jsonint(pdfobj, "StreamCount", pdf->stats.nstream);
+    if (pdf->nobjs)
+        cli_jsonint(pdfobj, "ObjectCount", pdf->nobjs);
+    if (pdf->stats.nobjstream)
+        cli_jsonint(pdfobj, "ObjectStreamCount", pdf->stats.nobjstream);
+    if (pdf->stats.ntrailer)
+        cli_jsonint(pdfobj, "TrailerCount", pdf->stats.ntrailer);
+    if (pdf->stats.nuri)
+        cli_jsonint(pdfobj, "URICount", pdf->stats.nuri);
+    if (pdf->stats.nxref)
+
cli_jsonint(pdfobj, "XRefCount", pdf->stats.nxref); if (pdf->flags & (1 << BAD_PDF_VERSION)) cli_jsonbool(pdfobj, "BadVersion", 1); if (pdf->flags & (1 << BAD_PDF_HEADERPOS)) diff --git a/libclamav/pdf.h b/libclamav/pdf.h index b3c928f2c..88b344749 100644 --- a/libclamav/pdf.h +++ b/libclamav/pdf.h @@ -102,39 +102,45 @@ struct pdf_stats_entry { }; struct pdf_stats { - int32_t ninvalidobjs; /* Number of invalid objects */ - int32_t njs; /* Number of javascript objects */ - int32_t nflate; /* Number of flate-encoded objects */ + int32_t naa; /* Number of Automatic Action objects */ + int32_t nacroform; /* Number of AcroForm objects */ int32_t nactivex; /* Number of ActiveX objects */ - int32_t nflash; /* Number of flash objects */ - int32_t ncolors; /* Number of colors */ - int32_t nasciihexdecode; /* Number of ASCIIHexDecode-filtered objects */ int32_t nascii85decode; /* Number of ASCII85Decode-filtered objects */ - int32_t nembeddedfile; /* Number of embedded files */ - int32_t nimage; /* Number of image objects */ - int32_t nlzw; /* Number of LZW-filtered objects */ - int32_t nrunlengthdecode; /* Number of RunLengthDecode-filtered objects */ - int32_t nfaxdecode; /* Number of CCITT-filtered objects */ - int32_t njbig2decode; /* Number of JBIG2Decode-filtered objects */ - int32_t ndctdecode; /* Number of DCTDecode-filtered objects */ - int32_t njpxdecode; /* Number of JPXDecode-filtered objects */ + int32_t nasciihexdecode; /* Number of ASCIIHexDecode-filtered objects */ + int32_t ncolors; /* Number of colors */ int32_t ncrypt; /* Number of Crypt-filtered objects */ - int32_t nstandard; /* Number of Standard-filtered objects */ - int32_t nsigned; /* Number of Signed objects */ - int32_t nopenaction; /* Number of OpenAction objects */ + int32_t ndctdecode; /* Number of DCTDecode-filtered objects */ + int32_t nembeddedfile; /* Number of embedded files */ + int32_t nfaxdecode; /* Number of CCITT-filtered objects */ + int32_t nflash; /* Number of flash objects */ + 
int32_t nflate; /* Number of flate-encoded objects */ + int32_t nimage; /* Number of image objects */ + int32_t ninvalidobjs; /* Number of invalid objects */ + int32_t njbig2decode; /* Number of JBIG2Decode-filtered objects */ + int32_t njpxdecode; /* Number of JPXDecode-filtered objects */ + int32_t njs; /* Number of javascript objects */ int32_t nlaunch; /* Number of Launch objects */ + int32_t nlzw; /* Number of LZW-filtered objects */ + int32_t nobjstream; /* Number of object streams */ + int32_t nopenaction; /* Number of OpenAction objects */ int32_t npage; /* Number of Page objects */ int32_t nrichmedia; /* Number of RichMedia objects */ - int32_t nacroform; /* Number of AcroForm objects */ + int32_t nrunlengthdecode; /* Number of RunLengthDecode-filtered objects */ + int32_t nsigned; /* Number of Signed objects */ + int32_t nstandard; /* Number of Standard-filtered objects */ + int32_t nstream; /* Number of streams */ + int32_t ntrailer; /* Number of trailer objects */ + int32_t nuri; /* Number of URI objects */ int32_t nxfa; /* Number of XFA objects */ + int32_t nxref; /* Number of xref objects */ struct pdf_stats_entry *author; /* Author of the PDF */ - struct pdf_stats_entry *creator; /* Application used to create the PDF */ - struct pdf_stats_entry *producer; /* Application used to produce the PDF */ struct pdf_stats_entry *creationdate; /* Date the PDF was created */ - struct pdf_stats_entry *modificationdate; /* Date the PDF was modified */ - struct pdf_stats_entry *title; /* Title of the PDF */ - struct pdf_stats_entry *subject; /* Subject of the PDF */ + struct pdf_stats_entry *creator; /* Application used to create the PDF */ struct pdf_stats_entry *keywords; /* Keywords of the PDF */ + struct pdf_stats_entry *modificationdate; /* Date the PDF was modified */ + struct pdf_stats_entry *producer; /* Application used to produce the PDF */ + struct pdf_stats_entry *subject; /* Subject of the PDF */ + struct pdf_stats_entry *title; /* Title of the PDF */ 
}; enum enc_method { diff --git a/unit_tests/clamscan/pdf_stats_test.py b/unit_tests/clamscan/pdf_stats_test.py new file mode 100644 index 000000000..a40c7691e --- /dev/null +++ b/unit_tests/clamscan/pdf_stats_test.py @@ -0,0 +1,72 @@ +# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved. + +""" +Run clamscan tests. +""" + +import sys +import os +import re +import shutil + +sys.path.append('../unit_tests') +import testcase + + +class TC(testcase.TestCase): + @classmethod + def setUpClass(cls): + super(TC, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + super(TC, cls).tearDownClass() + + def setUp(self): + super(TC, self).setUp() + + def tearDown(self): + super(TC, self).tearDown() + + # Remove scan temps directory between tests + if (self.path_tmp / "TD").exists(): + shutil.rmtree(self.path_tmp / "TD") + + self.verify_valgrind_log() + + def test_pdf_stats(self): + self.step_name('Test PDF Stats') + + tempdir=self.path_tmp / "TD" + if not os.path.isdir(tempdir): + os.makedirs(tempdir) + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf' + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format( + valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + tempdir=tempdir, + testfile=testfile, + ) + output = self.execute_command(command) + + assert output.ec == 0 # clean + + expected_strings = [ + '"JavaScriptObjectCount":1,', + '"EmbeddedFileCount":2,', + '"JBIG2DecodeCount":2,', + '"OpenActionCount":2,', + '"LaunchCount":2,', + '"PageCount":2,', + '"RichMediaCount":2,', + '"AcroFormCount":2,', + '"XFACount":2,', + '"AutomaticActionCount":2,', + '"StreamCount":7,', + '"ObjectCount":16,', + '"ObjectStreamCount":1,', + '"TrailerCount":1,', + '"XRefCount":1' + ] + 
self.verify_metadata_json(tempdir, expected_strings) diff --git a/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf b/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf new file mode 100644 index 000000000..fd85573ba --- /dev/null +++ b/unit_tests/input/other_scanfiles/pdf/pdf-stats-test.pdf @@ -0,0 +1,167 @@ +%PDF-1.7 +1 0 obj +<< + /Type /Catalog + /Pages 2 0 R + /OpenAction 5 0 R + /Launch 6 0 R + /EmbeddedFile 7 0 R + /AcroForm 8 0 R + /ObjStm 9 0 R + /JBIG2Decode 10 0 R + /RichMedia 11 0 R + /XFA 12 0 R + /AA 15 0 R +>> +endobj + +2 0 obj +<< + /Type /Pages + /Count 2 + /Kids [3 0 R 4 0 R] +>> +endobj + +3 0 obj +<< + /Type /Page + /Parent 2 0 R + /Contents 13 0 R + /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >> +>> +endobj + +4 0 obj +<< + /Type /Page + /Parent 2 0 R + /Contents 14 0 R + /OpenAction 5 0 R + /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >> +>> +endobj + +5 0 obj +<< + /S /JavaScript + /JS (alert("Hello from JS")) +>> +endobj + +6 0 obj +<< + /S /Launch + /F (dummy.exe) +>> +endobj + +7 0 obj +<< + /Type /EmbeddedFile + /Length 11 +>> +stream +HelloWorld +endstream +endobj + +8 0 obj +<< + /Type /AcroForm + /Fields [] +>> +endobj + +9 0 obj +<< + /Type /ObjStm + /N 1 + /First 4 + /Length 30 +>> +stream +17 0 << /Test /ObjStmEmbedded >> +endstream +endobj + +10 0 obj +<< + /Filter /JBIG2Decode + /Length 9 +>> +stream +JBIG2DATA +endstream +endobj + +11 0 obj +<< + /Type /RichMedia + /Length 9 +>> +stream +RichMedia +endstream +endobj + +12 0 obj +<< + /Type /XFA + /Length 3 +>> +stream +XFA +endstream +endobj + +13 0 obj +<< + /Length 37 +>> +stream +BT /F1 24 Tf 100 700 Td (Test 1) Tj ET +endstream +endobj + +14 0 obj +<< + /Length 37 +>> +stream +BT /F1 24 Tf 100 700 Td (Test 2) Tj ET +endstream +endobj + +15 0 obj +<< + /AA << /O 5 0 R >> +>> +endobj + +xref +0 17 +0000000000 65535 f +0000000009 00000 n +0000000232 00000 n +0000000305 00000 n 
+0000000462 00000 n +0000000639 00000 n +0000000708 00000 n +0000000760 00000 n +0000000845 00000 n +0000000899 00000 n +0000001009 00000 n +0000001095 00000 n +0000001176 00000 n +0000001245 00000 n +0000001335 00000 n +0000001425 00000 n +trailer +<< + /Size 17 + /Root 1 0 R +>> +startxref +1478 +%%EOF \ No newline at end of file