clamav/clamav-devel/libclamav/htmlnorm.c

/*
 *  Copyright (C) 2004 Trog <trog@clamav.net>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include "others.h"

#define FALSE (0)
#define TRUE (1)

/* Normalize an HTML buffer using the following rules:
	o Remove multiple contiguous spaces
	o Remove spaces around '<' and '>' in tags
	o Remove spaces around '=' in tags
	o Replace single quote with double quote in tags
	o Convert to lowercase
	o Convert all white space to a space character

   in_buff - input bytes; exactly in_size bytes are read, so the buffer
             does not need to be NUL-terminated
   in_size - number of bytes in in_buff

   Returns a buffer obtained from cli_malloc() holding the normalized,
   NUL-terminated text (never longer than the input), or NULL when
   in_size is negative, in_buff is NULL with a non-zero size, or the
   allocation fails.  The buffer is handed to the caller, who is
   responsible for releasing it.
*/

unsigned char *html_normalize(unsigned char *in_buff, off_t in_size)
{
	unsigned char *out_buff;
	off_t out_size=0, i;
	int had_space=FALSE, tag_depth=0, in_quote=FALSE;

	if ((in_size < 0) || (!in_buff && (in_size > 0))) {
		return NULL;
	}

	out_buff = (unsigned char *) cli_malloc(in_size+1);
	if (!out_buff) {
		cli_dbgmsg("html_normalize(): malloc failed\n");
		return NULL;
	}

	for (i=0 ; i < in_size ; i++) {
		const unsigned char c = in_buff[i];

		if (c == '<') {
			out_buff[out_size++] = '<';
			tag_depth++;
			if (tag_depth == 1) {
				had_space=TRUE; /* consume spaces */
			}
		} else if ((c == '=') && (tag_depth == 1)) {
			/* Remove preceding spaces */
			while ((out_size > 0) &&
				(out_buff[out_size-1] == ' ')) {
				out_size--;
			}
			out_buff[out_size++] = '=';
			had_space=TRUE; /* consume spaces after '=' */
		} else if (isspace(c)) {
			/* Any run of white space collapses to one ' ' */
			if (!had_space) {
				out_buff[out_size++] = ' ';
				had_space=TRUE;
			}
		} else if (c == '>') {
			/* Remove preceding spaces */
			if (tag_depth == 1) {
				while ((out_size > 0) &&
					(out_buff[out_size-1] == ' ')) {
					out_size--;
				}
			}
			out_buff[out_size++] = '>';
			/* Never let the depth go negative: a stray '>' in
			   plain text would otherwise switch off the tag
			   rules for every tag that follows it. */
			if (tag_depth > 0) {
				tag_depth--;
			}
		} else if ((c == '\'') && (tag_depth == 1)) {
			/* Convert single quotes to double quotes, but only
			   for a quote that opens an attribute value (it
			   directly follows '=') or closes one opened that
			   way. */
			if (in_quote ||
				((out_size > 0) &&
				 (out_buff[out_size-1] == '='))) {
				out_buff[out_size++] = '\"';
				in_quote = !in_quote;
			} else {
				out_buff[out_size++] = '\'';
			}
		} else {
			out_buff[out_size++] = tolower(c);
			had_space=FALSE;
		}
	}
	out_buff[out_size] = '\0';
	return out_buff;
}

/* Strip "<! ... >" sequences from a NUL-terminated buffer.
 *
 * Everything from a '<' that is immediately followed by '!' up to and
 * including the next '>' is dropped (or up to the end of the string when
 * no '>' follows).  All other bytes are copied unchanged.
 *
 * Returns a new cli_malloc()ed NUL-terminated string, or NULL when line
 * is NULL or the allocation fails.
 */
unsigned char *remove_html_comments(unsigned char *line)
{
	unsigned char *result, *out;
	const unsigned char *in;

	if (!line) {
		return NULL;
	}

	/* The output can never be longer than the input */
	result = (unsigned char *) cli_malloc(strlen((const char *) line) + 1);
	if (!result) {
		return NULL;
	}

	out = result;
	in = line;
	while (*in) {
		if ((in[0] == '<') && (in[1] == '!')) {
			/* Skip the '<', then everything up to the closing '>' */
			in++;
			while (*in && (*in != '>')) {
				in++;
			}
			if (*in) {
				in++;	/* step over the '>' itself */
			}
		} else {
			*out++ = *in++;
		}
	}
	*out = '\0';
	return result;
}

/* Decode the numeric part of an HTML character reference into its
 * character value.
 *
 * cref - NUL-terminated text that follows the "&#" introducer: decimal
 *        digits, or 'x'/'X' followed by hexadecimal digits, optionally
 *        terminated by ';'
 * dest - receives the decoded value truncated to a single byte; it is
 *        only written when a number was actually decoded
 *
 * Returns the number of bytes of cref that were consumed (prefix, digits
 * and the optional ';'), or 0 when cref is shorter than two characters
 * or does not start with a number.
 */
unsigned int decode_html_char_ref(unsigned char *cref,
                                    unsigned char *dest)
{
	unsigned int value=0, count=0, digits=0, digit;
	int hex=0;

	if (!cref || !dest || !cref[0] || !cref[1]) {
		return 0;
	}

	if (((*cref == 'x') || (*cref == 'X')) && isxdigit(cref[1])) {
		hex=1;
		cref++;
		count++;
	}

	while (isdigit(*cref) || (hex && isxdigit(*cref))) {
		if (isdigit(*cref)) {
			digit = (unsigned int) (*cref - '0');
		} else {
			/* 'a'-'f' / 'A'-'F' are not contiguous with '0'-'9' */
			digit = (unsigned int) (tolower(*cref) - 'a') + 10;
		}
		value = value * (hex ? 16 : 10) + digit;
		cref++;
		count++;
		digits++;
	}

	/* Without at least one digit this is not a character reference;
	   report failure instead of emitting a spurious NUL byte. */
	if (digits == 0) {
		return 0;
	}

	if (*cref == ';') {
		count++;
	}

	*dest = (unsigned char) value;

	return count;
}

/* Replace numeric HTML character references ("&#NNN;" / "&#xHH;") in a
 * NUL-terminated buffer with the single byte they encode.
 *
 * An '&' that does not introduce a numeric reference accepted by
 * decode_html_char_ref() is copied through unchanged.
 *
 * Returns a new cli_malloc()ed NUL-terminated string (never longer than
 * the input), or NULL when line is NULL or the allocation fails.
 */
unsigned char *remove_html_char_ref(unsigned char *line)
{
	unsigned char *newline, *newcurrent;
	unsigned char *linepos;
	unsigned int count;	/* same width as the decoder's return value */
	size_t prefix_len;

	if (!line) {
		return NULL;
	}

	newcurrent = newline =
		(unsigned char *) cli_malloc(strlen((const char *) line) + 1);
	if (!newline) {
		return NULL;
	}

	for (;;) {
		linepos = (unsigned char *) strchr((const char *) line, '&');
		if (!linepos) {
			/* No more references: copy the tail, NUL included */
			strcpy((char *) newcurrent, (const char *) line);
			return newline;
		}

		/* Copy the literal text in front of the '&' */
		prefix_len = (size_t) (linepos - line);
		memcpy(newcurrent, line, prefix_len);
		newcurrent += prefix_len;

		count = 0;
		/* Only "&#" followed by at least one more character can be a
		   numeric reference; on success the decoder stores the
		   decoded byte directly at newcurrent. */
		if ((linepos[1] == '#') && linepos[2]) {
			count = decode_html_char_ref(linepos+2, newcurrent);
		}
		/* TODO: character entities, &amp; etc. */

		if (count > 0) {
			newcurrent++;
			linepos += count+2;	/* skip "&#" plus the reference */
		} else {
			*newcurrent = '&';
			newcurrent++;
			linepos++;
		}
		line = linepos;
	}
}

/* Convert a single hexadecimal digit character to its numeric value.
 *
 * c - the character to convert ('0'-'9', 'A'-'F' or 'a'-'f')
 *
 * Returns 0..15 for a hexadecimal digit, or -1 when c is not one.
 */
int char2hex(unsigned char c)
{
	if ((c >= '0') && (c <= '9')) {
		return (c-'0');
	} else if ((c >= 'A') && (c <= 'F')) {
		return (c-'A'+10);
	} else if ((c >= 'a') && (c <= 'f')) {
		return (c-'a'+10);
	}
	return -1;
}

/* Decode quoted-printable style "=XX" escapes in a buffer.
 *
 *	o "=XX" (two hexadecimal digits) becomes the byte 0xXX
 *	o '=' followed by white space is dropped together with that
 *	  white space
 *	o any other '=' is dropped
 *	o every other byte is copied unchanged
 *
 * line    - input bytes; exactly in_size bytes are examined, so the
 *           buffer does not need to be NUL-terminated
 * in_size - number of bytes in line
 *
 * Returns a new cli_malloc()ed NUL-terminated buffer (never longer than
 * the input), or NULL when in_size is negative, line is NULL with a
 * non-zero size, or the allocation fails.
 */
char *quoted_decode(unsigned char *line, off_t in_size)
{
	unsigned char *newline, *newcurrent;
	off_t pos=0;

	if ((in_size < 0) || (!line && (in_size > 0))) {
		return NULL;
	}

	newcurrent = newline = (unsigned char *) cli_malloc(in_size + 1);
	if (!newline) {
		return NULL;
	}

	while (pos < in_size) {
		if (line[pos] != '=') {
			*newcurrent++ = line[pos++];
			continue;
		}

		/* line[pos] is '='; only look ahead at bytes that are really
		   inside the buffer. */
		if (((in_size - pos) > 1) && isspace(line[pos+1])) {
			pos++;
			while ((pos < in_size) && isspace(line[pos])) {
				pos++;
			}
			continue;
		}
		if (((in_size - pos) > 2) &&
			isxdigit(line[pos+1]) && isxdigit(line[pos+2])) {
			*newcurrent++ = (unsigned char)
					((char2hex(line[pos+1]) * 16) +
					 char2hex(line[pos+2]));
			pos += 3;
			continue;
		}

		/* Not a valid escape: skip the '=' */
		pos++;
	}
	*newcurrent = '\0';
	return (char *) newline;
}
new method of file type detection; HTML normalisation git-svn: trunk@648 2004-07-02 23:00:58 +00:00			`/*`
			`* Copyright (C) 2004 Trog <trog@clamav.net>`
			`*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program; if not, write to the Free Software`
			`* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.`
			`*/`

			`#if HAVE_CONFIG_H`
			`#include "clamav-config.h"`
			`#endif`

			`#include <stdio.h>`
			`#include <unistd.h>`
			`#include <sys/types.h>`
			`#include <sys/stat.h>`
			`#include <fcntl.h>`

			`#include "others.h"`

			`#define FALSE (0)`
			`#define TRUE (1)`

			`/* Normalize an HTML buffer using the following rules:`
			`o Remove multiple contiguous spaces`
			`o Remove spaces around '<' and '>' in tags`
			`o Remove spaces around '=' in tags`
			`o Replace single quote with double quote in tags`
			`o Convert to lowercase`
			`o Convert all white space to a space character`
			`*/`

			`unsigned char html_normalize(unsigned char in_buff, off_t in_size)`
			`{`
			`unsigned char *out_buff;`
			`off_t out_size=0, i;`
			`int had_space=FALSE, tag_depth=0, in_quote=FALSE;`

			`out_buff = (unsigned char *) cli_malloc(in_size+1);`
			`if (!out_buff) {`
			`cli_dbgmsg("html_normalize(): malloc failed\n");`
			`return NULL;`
			`}`

			`for (i=0 ; i < in_size ; i++) {`
			`if (in_buff[i] == '<') {`
			`out_buff[out_size++] = '<';`
			`tag_depth++;`
			`if (tag_depth == 1) {`
			`had_space=TRUE; /* consume spaces */`
			`}`
			`} else if ((in_buff[i] == '=') && (tag_depth == 1)) {`
			`/* Remove preceeding spaces */`
			`while ((out_size > 0) &&`
			`(out_buff[out_size-1] == ' ')) {`
			`out_size--;`
			`}`
			`out_buff[out_size++] = '=';`
			`had_space=TRUE;`
			`} else if (isspace(in_buff[i])) {`
			`if (!had_space) {`
			`out_buff[out_size++] = ' ';`
			`had_space=TRUE;`
			`}`
			`} else if (in_buff[i] == '>') {`
			`/* Remove preceeding spaces */`
			`if (tag_depth == 1) {`
			`while ((out_size > 0) &&`
			`(out_buff[out_size-1] == ' ')) {`
			`out_size--;`
			`}`
			`}`
			`out_buff[out_size++] = '>';`
			`tag_depth--;`
			`} else if ((in_buff[i] == '\'') && (tag_depth==1)) {`
			`/* Convert single quotes to double quotes */`
			`if (in_quote \|\| out_buff[out_size-1] == '=') {`
			`out_buff[out_size++] = '\"';`
			`in_quote = !in_quote;`
			`} else {`
			`out_buff[out_size++] = '\'';`
			`}`
			`} else {`
			`out_buff[out_size++] = tolower(in_buff[i]);`
			`had_space=FALSE;`
			`}`
			`}`
			`out_buff[out_size] = '\0';`
			`return out_buff;`
			`}`

			`/* Remove HTML style comments from buffer */`
			`unsigned char remove_html_comments(unsigned char line)`
			`{`
			`unsigned char newline, newcurrent;`
			`int in_comment=FALSE;`

			`if (!line) {`
			`return NULL;`
			`}`

			`newcurrent = newline = (unsigned char *) cli_malloc(strlen(line) + 1);`
			`if (!newline) {`
			`return NULL;`
			`}`

			`while(line) {`
			`if (!(in_comment)) {`
			`while (line && line != '<') {`
			`newcurrent = line;`
			`newcurrent++;`
			`line++;`
			`}`
			`if (! *line) {`
			`break;`
			`}`
			`if (!line[1]) {`
			`newcurrent = line;`
			`newcurrent++;`
			`line++;`
			`continue;`
			`}`
			`if (line[1] == '!') {`
			`in_comment = TRUE;`
			`line += 1;`
			`} else {`
			`newcurrent = line;`
			`newcurrent++;`
			`line++;`
			`}`
			`} else {`
			`while (line && line != '>') {`
			`line++;`
			`}`
			`if (! *line) {`
			`break;`
			`}`
			`in_comment = FALSE;`
			`line++;`
			`}`
			`}`
			`*newcurrent = '\0';`
			`return newline;`
			`}`

			`/* Decode an HTML escape character into it's character value */`
			`unsigned int decode_html_char_ref(unsigned char *cref,`
			`unsigned char *dest)`
			`{`

			`unsigned int hex=FALSE, value=0, count=0;`

			`if (!cref[0] \|\| !cref[1]) {`
			`return 0;`
			`}`

			`if (((cref == 'x') \|\| (cref == 'X')) && isxdigit(cref[1])) {`
			`hex=TRUE;`
			`cref++;`
			`count++;`
			`}`

			`while (isdigit(cref) \|\| (hex && isxdigit(cref))) {`
			`if (hex) {`
			`value *= 16;`
			`} else {`
			`value *= 10;`
			`}`
			`value += (*cref - '0');`
			`cref++;`
			`count++;`
			`}`
			`if (*cref == ';') {`
			`cref++;`
			`count++;`
			`}`

			`*dest = value;`

			`return count;`
			`}`

			`/* Remove HTML character escape sequences from buffer */`
			`unsigned char remove_html_char_ref(unsigned char line)`
			`{`
			`unsigned char newline, newcurrent;`
			`unsigned char *linepos, count;`

			`if (!line) {`
			`return NULL;`
			`}`

			`newcurrent = newline = (unsigned char *) cli_malloc(strlen(line) + 1);`
			`if (!newline) {`
			`return NULL;`
			`}`
			`while (line) {`
			`linepos = strchr(line, '&');`
			`if (!linepos) {`
			`strcpy(newcurrent, line);`
			`return newline;`
			`}`
			`strncpy(newcurrent, line, linepos-line);`
			`newcurrent += linepos-line;`

			`if (!linepos[1] \|\| !linepos[2]) {`
			`*newcurrent = '&';`
			`newcurrent++;`
			`line = linepos+1;`
			`continue;`
			`}`
			`switch (linepos[1]) {`
			`case '#':`
			`count = decode_html_char_ref(linepos+2,`
			`newcurrent);`
			`if (count > 0) {`
			`newcurrent++;`
			`linepos += count+2;`
			`} else {`
			`*newcurrent = '&';`
			`newcurrent++;`
			`linepos++;`
			`}`
			`break;`
			`/* TODO: character entities, & etc. */`
			`default:`
			`*newcurrent = '&';`
			`newcurrent++;`
			`linepos++;`
			`}`
			`line = linepos;`
			`}`
			`*newcurrent = '\0';`
			`return newline;`
			`}`

			`int char2hex(unsigned char c)`
			`{`
			`if ((c-'0') <= 9) {`
			`return (c-'0');`
			`} else if ((c-'A') <= 5) {`
			`return (c-'A'+10);`
			`}`
			`return (c-'a'+10);`
			`}`

			`char quoted_decode(unsigned char line, off_t in_size)`
			`{`
			`unsigned char newline, newcurrent, *line_end;`

			`newcurrent = newline = (unsigned char *) cli_malloc(in_size + 1);`
			`if (!newline) {`
			`return NULL;`
			`}`

			`line_end = line+in_size;`
			`while (line <= line_end) {`
			`while ((line < line_end) && *line != '=') {`
			`newcurrent = line;`
			`line++;`
			`newcurrent++;`
			`}`
			`if ((line < line_end) && isspace(line[1])) {`
			`line++;`
			`while ((line < line_end) && isspace(*line)) {`
			`line++;`
			`}`
			`continue;`
			`}`
			`if ((line+2) <= line_end) {`
			`if (isxdigit(line[1]) && isxdigit(line[2])) {`
			`newcurrent = (char2hex(line[1]) 16) +`
			`char2hex(line[2]);`
			`newcurrent++;`
			`line += 3;`
			`continue;`
			`}`
			`}`
			`line++;`
			`}`
			`*newcurrent = '\0';`
			`return newline;`
			`}`