libclamav: Add PDF stats for metadata collection.

More PDF statistics were requested for feature parity.
If metadata collection is enabled, the following additional PDF
statistics will be collected:
  - Number of Automatic Actions
  - Number of Streams
  - Number of Objects
  - Number of Object Streams
  - Number of Trailers
  - Number of URIs
  - Number of Xrefs

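Additionally, some of the parsing logic was fixed during testing of
these features: object streams are now detected by searching for
"/ObjStm" rather than the exact string "/Type/ObjStm", and the object
stream array is allocated before the object stream count is
incremented.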

CLAM-2820
This commit is contained in:
John Humlick 2025-07-30 17:08:05 -07:00
parent f963dc2d9e
commit c4c9eb5c5e
No known key found for this signature in database
GPG key ID: 543839C3681B11C1
4 changed files with 339 additions and 43 deletions

View file

@ -87,6 +87,7 @@ static void pdf_export_json(struct pdf_struct *);
static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
@ -1652,12 +1653,13 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
* is an object stream. If so, collect the relevant info.
*/
dict_len = obj->stream - start;
if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
if (NULL != (pstr = pdf_getdict(start, &dict_len, "/ObjStm"))) {
int objstm_first = -1;
int objstm_length = -1;
int objstm_n = -1;
cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
cli_dbgmsg("pdf_extract_obj: Found /ObjStm\n");
pdf->stats.nobjstream++;
dict_len = obj->stream - start;
if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) {
@ -1668,14 +1670,17 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
} else {
/* Add objstm to pdf struct, so it can be freed eventually */
pdf->nobjstms++;
pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
if (!pdf->objstms) {
pdf->objstms = malloc(sizeof(struct objstm_struct *));
} else {
pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * (pdf->nobjstms + 1));
}
if (!pdf->objstms) {
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
status = CL_EMEM;
goto done;
}
pdf->nobjstms++;
objstm = malloc(sizeof(struct objstm_struct));
if (!objstm) {
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
@ -1955,6 +1960,7 @@ struct pdfname_action {
};
static struct pdfname_action pdfname_actions[] = {
{"AA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AutomaticAction_cb},
{"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb},
{"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
{"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
@ -2139,7 +2145,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length)
{
const char *enc;
pdf->stats.ntrailer++;
enc = cli_memstr(s, length, "/Encrypt", 8);
if (enc) {
char *newID;
@ -2223,6 +2229,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
if ((CL_SUCCESS == has_stream) ||
(CL_EFORMAT == has_stream)) {
/* Stream found. Store this fact and the stream bounds. */
pdf->stats.nstream++;
cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size);
obj->flags |= (1 << OBJ_STREAM);
obj->stream = stream;
@ -3902,6 +3909,8 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
if (!q || xrefCheck(q, q + bytesleft) == -1) {
cli_dbgmsg("cli_pdf: did not find valid xref\n");
pdf.flags |= 1 << BAD_PDF_TRAILER;
} else {
pdf.stats.nxref++;
}
}
}
@ -4564,35 +4573,61 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
}
}
/**
 * @brief Name-action callback that counts a RichMedia occurrence.
 *
 * Bumps pdf->stats.nrichmedia by one. Does nothing when pdf is NULL.
 *
 * @param pdf  PDF parsing context whose statistics are updated.
 * @param obj  Unused.
 * @param act  Unused.
 */
static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);

    if (pdf != NULL) {
        pdf->stats.nrichmedia++;
    }
}
static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
UNUSEDPARAM(obj);
UNUSEDPARAM(act);
if (NULL == pdf)
cli_ctx *ctx = pdf->ctx;
if (!(pdf) || !(pdf->ctx->wrkproperty) || !(SCAN_COLLECT_METADATA)) {
return;
}
pdf->stats.nacroform++;
}
static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
UNUSEDPARAM(obj);
UNUSEDPARAM(act);
char *p1 = NULL;
const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
: (const char *)(obj->start + pdf->map);
cli_ctx *ctx = pdf->ctx;
if (!(pdf) || !(pdf->ctx->wrkproperty) || !(SCAN_COLLECT_METADATA)) {
return;
}
// ToDO: Find a way to not count references to the same automatic action multiple times
pdf->stats.naa++;
}
static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
UNUSEDPARAM(obj);
UNUSEDPARAM(act);
cli_ctx *ctx = pdf->ctx;
if (!(pdf) || !(pdf->ctx->wrkproperty) || !(SCAN_COLLECT_METADATA)) {
return;
}
pdf->stats.nrichmedia++;
}
static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
UNUSEDPARAM(obj);
UNUSEDPARAM(act);
if (NULL == pdf)
cli_ctx *ctx = pdf->ctx;
if (!(pdf) || !(pdf->ctx->wrkproperty) || !(SCAN_COLLECT_METADATA)) {
return;
}
pdf->stats.nxfa++;
}
@ -4761,6 +4796,8 @@ static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
return;
}
pdf->stats.nuri++;
if (obj->objstm) {
bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
} else {
@ -5118,8 +5155,22 @@ static void pdf_export_json(struct pdf_struct *pdf)
cli_jsonint(pdfobj, "RichMediaCount", pdf->stats.nrichmedia);
if (pdf->stats.nacroform)
cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform);
if (pdf->stats.nxfa)
if (pdf->stats.nacroform)
cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa);
if (pdf->stats.naa)
cli_jsonint(pdfobj, "AutomaticActionCount", pdf->stats.naa);
if (pdf->stats.nstream)
cli_jsonint(pdfobj, "StreamCount", pdf->stats.nstream);
if (pdf->nobjs)
cli_jsonint(pdfobj, "ObjectCount", pdf->nobjs);
if (pdf->stats.nobjstream)
cli_jsonint(pdfobj, "ObjectStreamCount", pdf->stats.nobjstream);
if (pdf->stats.ntrailer)
cli_jsonint(pdfobj, "TrailerCount", pdf->stats.ntrailer);
if (pdf->stats.nuri)
cli_jsonint(pdfobj, "URICount", pdf->stats.nuri);
if (pdf->stats.nxref)
cli_jsonint(pdfobj, "XRefCount", pdf->stats.nxref);
if (pdf->flags & (1 << BAD_PDF_VERSION))
cli_jsonbool(pdfobj, "BadVersion", 1);
if (pdf->flags & (1 << BAD_PDF_HEADERPOS))

View file

@ -102,39 +102,45 @@ struct pdf_stats_entry {
};
/*
 * Statistics and document-information entries gathered for one PDF.
 *
 * Each member is declared exactly once. The counters come first, in
 * alphabetical order, followed by the document-information entries, also
 * in alphabetical order.
 */
struct pdf_stats {
    int32_t naa;                              /* Number of Automatic Action objects */
    int32_t nacroform;                        /* Number of AcroForm objects */
    int32_t nactivex;                         /* Number of ActiveX objects */
    int32_t nascii85decode;                   /* Number of ASCII85Decode-filtered objects */
    int32_t nasciihexdecode;                  /* Number of ASCIIHexDecode-filtered objects */
    int32_t ncolors;                          /* Number of colors */
    int32_t ncrypt;                           /* Number of Crypt-filtered objects */
    int32_t ndctdecode;                       /* Number of DCTDecode-filtered objects */
    int32_t nembeddedfile;                    /* Number of embedded files */
    int32_t nfaxdecode;                       /* Number of CCITT-filtered objects */
    int32_t nflash;                           /* Number of flash objects */
    int32_t nflate;                           /* Number of flate-encoded objects */
    int32_t nimage;                           /* Number of image objects */
    int32_t ninvalidobjs;                     /* Number of invalid objects */
    int32_t njbig2decode;                     /* Number of JBIG2Decode-filtered objects */
    int32_t njpxdecode;                       /* Number of JPXDecode-filtered objects */
    int32_t njs;                              /* Number of javascript objects */
    int32_t nlaunch;                          /* Number of Launch objects */
    int32_t nlzw;                             /* Number of LZW-filtered objects */
    int32_t nobjstream;                       /* Number of object streams */
    int32_t nopenaction;                      /* Number of OpenAction objects */
    int32_t npage;                            /* Number of Page objects */
    int32_t nrichmedia;                       /* Number of RichMedia objects */
    int32_t nrunlengthdecode;                 /* Number of RunLengthDecode-filtered objects */
    int32_t nsigned;                          /* Number of Signed objects */
    int32_t nstandard;                        /* Number of Standard-filtered objects */
    int32_t nstream;                          /* Number of streams */
    int32_t ntrailer;                         /* Number of trailer objects */
    int32_t nuri;                             /* Number of URI objects */
    int32_t nxfa;                             /* Number of XFA objects */
    int32_t nxref;                            /* Number of xref objects */
    struct pdf_stats_entry *author;           /* Author of the PDF */
    struct pdf_stats_entry *creationdate;     /* Date the PDF was created */
    struct pdf_stats_entry *creator;          /* Application used to create the PDF */
    struct pdf_stats_entry *keywords;         /* Keywords of the PDF */
    struct pdf_stats_entry *modificationdate; /* Date the PDF was modified */
    struct pdf_stats_entry *producer;         /* Application used to produce the PDF */
    struct pdf_stats_entry *subject;          /* Subject of the PDF */
    struct pdf_stats_entry *title;            /* Title of the PDF */
};
enum enc_method {

View file

@ -0,0 +1,72 @@
# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
"""
Run clamscan tests.
"""
import sys
import os
import re
import shutil
sys.path.append('../unit_tests')
import testcase
class TC(testcase.TestCase):
    """Scan a crafted PDF with clamscan and check the PDF statistics written to the metadata JSON."""

    @classmethod
    def setUpClass(cls):
        # Nothing class-specific to prepare; defer to the base class.
        super().setUpClass()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()

    def setUp(self):
        super().setUp()

    def tearDown(self):
        super().tearDown()

        # Remove scan temps directory between tests
        scan_temps = self.path_tmp / "TD"
        if scan_temps.exists():
            shutil.rmtree(scan_temps)

        self.verify_valgrind_log()

    def test_pdf_stats(self):
        """Run clamscan with --gen-json on the stats PDF and verify the expected counters appear."""
        self.step_name('Test PDF Stats')

        # --leave-temps keeps the scan temp files in this directory so they can be inspected below.
        tempdir = self.path_tmp / "TD"
        if not os.path.isdir(tempdir):
            os.makedirs(tempdir)

        inputs_dir = TC.path_source / 'unit_tests' / 'input'
        testfile = inputs_dir / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf'
        signature_db = inputs_dir / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb'

        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            path_db=signature_db,
            tempdir=tempdir,
            testfile=testfile,
        )
        result = self.execute_command(command)

        assert result.ec == 0  # clean

        expected_strings = [
            '"JavaScriptObjectCount":1,',
            '"EmbeddedFileCount":2,',
            '"JBIG2DecodeCount":2,',
            '"OpenActionCount":2,',
            '"LaunchCount":2,',
            '"PageCount":2,',
            '"RichMediaCount":2,',
            '"AcroFormCount":2,',
            '"XFACount":2,',
            '"AutomaticActionCount":2,',
            '"StreamCount":7,',
            '"ObjectCount":16,',
            '"ObjectStreamCount":1,',
            '"TrailerCount":1,',
            '"XRefCount":1'
        ]
        self.verify_metadata_json(tempdir, expected_strings)

View file

@ -0,0 +1,167 @@
%PDF-1.7
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
/OpenAction 5 0 R
/Launch 6 0 R
/EmbeddedFile 7 0 R
/AcroForm 8 0 R
/ObjStm 9 0 R
/JBIG2Decode 10 0 R
/RichMedia 11 0 R
/XFA 12 0 R
/AA 15 0 R
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 2
/Kids [3 0 R 4 0 R]
>>
endobj
3 0 obj
<<
/Type /Page
/Parent 2 0 R
/Contents 13 0 R
/Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >>
>>
endobj
4 0 obj
<<
/Type /Page
/Parent 2 0 R
/Contents 14 0 R
/OpenAction 5 0 R
/Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >>
>>
endobj
5 0 obj
<<
/S /JavaScript
/JS (alert("Hello from JS"))
>>
endobj
6 0 obj
<<
/S /Launch
/F (dummy.exe)
>>
endobj
7 0 obj
<<
/Type /EmbeddedFile
/Length 11
>>
stream
HelloWorld
endstream
endobj
8 0 obj
<<
/Type /AcroForm
/Fields []
>>
endobj
9 0 obj
<<
/Type /ObjStm
/N 1
/First 4
/Length 30
>>
stream
17 0 << /Test /ObjStmEmbedded >>
endstream
endobj
10 0 obj
<<
/Filter /JBIG2Decode
/Length 9
>>
stream
JBIG2DATA
endstream
endobj
11 0 obj
<<
/Type /RichMedia
/Length 9
>>
stream
RichMedia
endstream
endobj
12 0 obj
<<
/Type /XFA
/Length 3
>>
stream
XFA
endstream
endobj
13 0 obj
<<
/Length 37
>>
stream
BT /F1 24 Tf 100 700 Td (Test 1) Tj ET
endstream
endobj
14 0 obj
<<
/Length 37
>>
stream
BT /F1 24 Tf 100 700 Td (Test 2) Tj ET
endstream
endobj
15 0 obj
<<
/AA << /O 5 0 R >>
>>
endobj
xref
0 17
0000000000 65535 f
0000000009 00000 n
0000000232 00000 n
0000000305 00000 n
0000000462 00000 n
0000000639 00000 n
0000000708 00000 n
0000000760 00000 n
0000000845 00000 n
0000000899 00000 n
0000001009 00000 n
0000001095 00000 n
0000001176 00000 n
0000001245 00000 n
0000001335 00000 n
0000001425 00000 n
trailer
<<
/Size 17
/Root 1 0 R
>>
startxref
1478
%%EOF