#!/usr/bin/perl
##########################################################################
#
# xlint.pl
#
# This program acts as an error-tolerant XML parser. It reports all the
# detected errors as it parses through the XML document and never
# terminates before it reaches the end of the document.
#
# Author : Yuhui Jin
# Juan Fernando Arguello
# Stanford University
# Last Modified : December 24, 2003
#
# Acknowledgement: This work is supported by US Air Force and the DARPA
# DAML project "OntoAgents" (01IN901C0).
#
#
##########################################################################
############## Main program ##############
# Define global variables
## Error types to label each error, used in error reporting
@errType = ("Syntax error for the xml declaration", #0
"Expect the attribute for version info", #1
"Invalid version number assignment", #2
"Invalid encoding name assignment", #3
"Invalid assignment for standalone attribute", #4
"Attribute not expected at this location", #5
"Syntax error for the tag", #6
"Duplicate DocType declaration", #7
"Missing the start tag", #8
"Missing the end tag", #9
"Syntax error for the comment", #10
"Syntax error for the processing instruction", #11
"Expect white space", #12
"Syntax error for attribute assignment" #13
);
## Line number of the current line being parsed
$linenum = 0;
## Store the line being parsed
$line = "";
## The position in the line being parsed
$pos = 0;
## The flag for verbose mode of error reporting.
## If it is set, error context consisting of N characters
## will be shown as well as the position of the error.
## The default is off.
$verbose = 0;
## The length of the error context
$context = 30;
## The counter for total number of errors
$errorCount = 0;
## Cache the line being procesed for reporting error in context
$cachedCurLine = "";
# Check correct command line usage of the parser
if ($#ARGV < 0) {
print "USAGE: ";
print "perl xlint.pl <file_name> [-v |-v <number_of_chars>]\n";
print "<file_name>: the absolute or relative file name of the XML document to be parsed.\n";
print "-v: the verbose mode where a context of N characters around the error ".
"position is displayed. The default value for N is 30.\n";
print "<number_of_chars>: the value for N, the length of error context.\n";
exit(1);
}
## get input file name
$infile = $ARGV[0];
## open source file
open(INPUT, "$infile") || die "Can't open file \"$infile\".\n";
## check whether we have verbose mode set.
if (($#ARGV >= 1) && ($ARGV[1] eq "-v")) {
$verbose = 1;
## check whether we have the context length set
if ($#ARGV >= 2) {
$context = int($ARGV[2]);
}
}
## start parsing the document
&parseDoc();
## report error count
&printErrorSummary();
exit(0);
############## Subroutines ##############
# Root of the parser
sub parseDoc() {
&parseProlog();
&parseBody();
close(INPUT);
&cleanupStack();
}
# Parse the prolog part of the document
#
# prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
# XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
# Misc ::= Comment | PI | S
# Comment ::= '<!--' ((char - '-') | ('-' (char - '-')))* '-->'
# PI ::= '<?' PITarget (char* - '?>')? '?>'
# S ::= (#x20 | #x9 | #xD | #xA)+ (i.e., white space characters)
#
# More in the XML specification 1.0 at www.w3.org
sub parseProlog() {
my ($err, $doctype, $depth, $stop, $more);
$line = &nonBlankLine();
$pos = 0;
if (defined($line)) {
## parse the xmlDecl part
if ($line =~ /(.*)<\?xml(.*)/) {
$pos = length($1) + 5;
$more = $2;
if ($1 =~ /[\s]*(.*)[\s]*/) {
if (length($1) != 0) {
## wrong syntax
&addError($errType[0], $linenum, 0, $cachedCurLine);
}
}
$line = &parseXMLDecl($more, $pos);
}
}
if ($line =~ /[\s]*(.*)[\s]*/) {
$line = $1;
}
if (length($line) == 0) {
$line = &nonBlankLine();
$pos = 0;
}
$stop = 0;
$doctype = 0;
## parse Misc, which could be either comment, PI or doctypedecl;
## and doctypedecl can exist at most once.
while ((defined($line)) && (!$stop)) {
## parse tags
if ($line =~ /([^<>]*)(<.*)/) {
$line = $2;
## Parse doctypedecl start tags <!DOCTYPE...>
if ($line =~ /(<!DOCTYPE[^<>]*>)(.*)/) {
$line = $2;
$pos = $pos + length($1);
if ($doctype == 0) {
$doctype = 1;
## Todo: handle nested DTD in multiple lines
} else {
## duplicate doctype
&addError($errType[7], $linenum, $pos, $cachedCurLine);
}
}
## Parse comment <!-- ... -->
elsif (($line =~ /^<!--/) && ($line =~ /(<!--.*-->)(.*)/)) {
$line = $2;
$pos = $pos + length($1);
}
## catch incomplete comment starting with <!
elsif (($line =~ /^<!/) && ($line =~ /(<!.*>)(.*)/)) {
$line = $2;
$pos = $pos + length($1);
## incomplete comment
&addError($errType[10], $linenum, $pos-1, $cachedCurLine);
}
## catch incomplete comment end with ->
elsif ($line =~ /(<.*->)(.*)/) {
$line = $2;
$pos = $pos + length($1);
## incomplete comment
&addError($errType[10], $linenum, $pos-1, $cachedCurLine);
}
## parse PI <? ... ?>
elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*\?>)(.*)/)) {
$line = $2;
$pos = $pos + length($1);
}
## catch incomplete PI starting with <?
elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*>)(.*)/)) {
$line = $2;
$pos = $pos + length($1);
## incomplete PI
&addError($errType[11], $linenum, $pos-1, $cachedCurLine);
}
## catch incomplete PI end with ?>
elsif ($line =~ /(<.*\?>)(.*)/) {
$line = $2;
$pos = $pos + length($1);
## incomplete PI
&addError($errType[11], $linenum, $pos-1, $cachedCurLine);
}
## assume the beginning of element body, need to exit the subroutine
else {
$stop = 1;
}
}
# missing <, skip to the next <
elsif ($line =~ /([^<>]*)>(.*)/) {
$line = $2;
$pos = $pos + length($1) + 1;
## wrong syntax for a tag
&addError($errType[6]." for \"".$1."\"", $linenum, $pos-1, $cachedCurLine);
}
## skip to new line since no < or > (encountering test data)
else {
$line = &nonBlankLine();
$pos = 0;
}
}
}
# Parse the XMLDecl part
sub parseXMLDecl() {
my ($decl, $pos) = @_;
my ($err, $more);
$err = 0;
## parse the versionInfo
if ($decl =~ /(([\s]*)version[\s]*=[\s]*)(.*)/) {
if (length($2) == 0) {
## need whitespace
&addError($errType[12], $linenum, $pos, $cachedCurLine);
}
## parse the version number
$pos = $pos + length($1);
$more = $3;
if ($more =~ /(("|')?([a-zA-Z0-9_\.:]|-)+("|')?)(.*)/) {
$pos = $pos + length($1);
$more = $5;
} else {
## wrong version number format
&addError($errType[2], $linenum, $pos, $cachedCurLine);
$err = 1;
}
## parse the optional EncodingDecl
if (!$err) {
if ($more =~ /(([\s]*)encoding[\s]*=[\s]*)(.*)/) {
if (length($2) == 0) {
## need whitespace
&addError($errType[12], $linenum, $pos, $cachedCurLine);
}
$pos = $pos + length($1);
$more = $3;
if ($more =~ /(("|')([A-Za-z]([A-Za-z0-9\._]|-)*)("|'))(.*)/) {
$pos = $pos + length($1);
$more = $6;
} else {
## wrong encoding name
&addError($errType[3], $linenum, $pos, $cachedCurLine);
$err = 1;
}
}
}
## parse the optional SDDecl
if (!$err) {
if ($more =~ /(([\s]*)standalone[\s]*=[\s]*)(.*)/) {
if (length($2) == 0) {
## need whitespace
&addError($errType[12], $linenum, $pos, $cachedCurLine);
}
$pos = $pos + length($1);
$more = $3;
if ($more =~ /(("|')(yes|no)("|'))(.*)/) {
$pos = $pos + $1;
$more = $5;
} else {
## wrong Standalone Document Declaration
&addError($errType[4], $linenum, $pos, $cachedCurLine);
$err = 1;
}
}
}
if (!$err) {
if ($more =~ /(.*)\?>(.*)/) {
$tmp = $pos;
$pos = $pos + length($1) + 2;
$more = $2;
if ($1 =~ /[\s]*(.*)[\s]*/) {
if (length($1) != 0) {
## wrong attributes
&addError($errType[5], $linenum, $tmp, $cachedCurLine);
$err = 1;
}
}
} else {
## missing end tag
&addError($errType[6], $linenum, $pos, $cachedCurLine);
$err = 1;
}
}
} else {
## wrong version attribute
&addError($errType[1], $linenum, $pos, $cachedCurLine);
$err = 1;
}
return $more;
}
# Parse the document body containing nested tagged content,
# comments and processing instructions.
sub parseBody() {
while (defined($line)) {
## parse tag starting with <.
if ($line =~ /([^<>]*)(<.*)/) {
$line = $2;
$pos = $pos + length($1);
## "<!-- ... -->": parse the comment
if (($line =~ /^<!--/) && ($line =~ /(<!--.*-->)(.*)/)) {
$line = $2;
$pos = $pos + length($1);
}
## catch incomplete comment starting with <!
elsif (($line =~ /^<!/) && ($line =~ /(<!.*>)(.*)/)) {
$line = $2;
$pos = $pos + length($1);
## incomplete comment
&addError($errType[10], $linenum, $pos-1, $cachedCurLine);
}
## catch incomplete comment end with ->
elsif ($line =~ /(<.*->)(.*)/) {
$line = $2;
$pos = $pos + length($1);
## incomplete comment
&addError($errType[10], $linenum, $pos-1, $cachedCurLine);
}
## parse PI <? ... ?>
elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*\?>)(.*)/)) {
$line = $2;
$pos = $pos + length($1);
}
## catch incomplete PI starting with <?
elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*>)(.*)/)) {
$line = $2;
$pos = $pos + length($1);
## incomplete PI
&addError($errType[11], $linenum, $pos-1, $cachedCurLine);
}
## catch incomplete PI end with ?>
elsif ($line =~ /(<.*\?>)(.*)/) {
$line = $2;
$pos = $pos + length($1);
## incomplete PI
&addError($errType[11], $linenum, $pos-1, $cachedCurLine);
}
## "< ... >": parse the tag content
elsif ($line =~ /<([^<>]*)>(.*)/) {
$content = $1;
$line = $2;
$prevpos = $pos; # cache the pos for error reporting on attribute-value pairs
$pos = $pos + length($1) + 2;
## "< ... />": parse an empty tag (with no matching tags)
if ($content =~ /\/$/) {
}
## "</ ... >": parse and match the tag with existing ones.
elsif ($content =~ /^\//) {
if ($content =~ /\/(.*)/) {
## check whether there is a match with a start tag in the stack
$depth = &locateTagname($1);
if ($depth >= 0) {
&popFrom($depth+1);
&pop();
} else {
## miss the start tag
&addError($errType[8]." for \"".$content."\"", $linenum, $pos-1, $cachedCurLine);
}
}
}
## "< ... >": parse the start tag
else {
## check whether we have attribute-value pairs
if ($content =~ /([^\s]+)(\s+)(.*)/) {
## set the tag name
$content = $1;
## parse the attribute-value pair list
&parseAttributes($3, $prevpos + length($1) + length($2) + 1);
}
&push($content, $linenum, $pos);
}
}
## incomplete tag
else {
## wrong syntax for a tag
&addError($errType[6], $linenum, $pos, $cachedCurLine);
if ($line =~ /(<[^<>]*)(<.*)/) {
$line = $2;
$pos = $pos + length($1);
}
## skip to new line if no < detected (encountering test data)
else {
$line = &nonBlankLine();
$pos = 0;
}
}
}
## parse tag starting with > (missing <)
## We assume a tag will not be seperated on different lines.
elsif ($line =~ /([^<>]*)>(.*)/) {
$line = $2;
$pos = $pos + length($1) + 1;
## wrong syntax for a tag
&addError($errType[6], $linenum, $pos-1, $cachedCurLine);
}
## skip to new line if no < or > is detected (encountering test data)
else {
$line = &nonBlankLine();
$pos = 0;
}
} # while loop
}
# Parse the attribute-value pairs
sub parseAttributes() {
my ($list, $p) = @_;
## we scan the list backwards from the end to the front,
## verifying the format of each attribute-value pair. This is
## because the pattern matching is greedy.
## set position to the end of the list as we start backwards
$p = $p + length($list);
while (length($list) != 0) {
## extract the attribute-value pair at the end of the list
if ($list =~ /(.*)([^\s\"=]+\s*=\s*\"[^\"]*\")(.*)/) {
$list = $1;
$tmp = $p - length($3);
$p = $p - length($2) - length($3);
## check whether there is content after the matched pair
if ($3 =~ /[\s]*(.*)/) {
if (length($1) != 0) {
## report error on attribute assignment syntax
&addError($errType[13], $linenum, $tmp, $cachedCurLine);
}
}
## remove any white space before the list
## making the list empty if it only consists of white spaces
if ($list =~ /[\s]*(.*)/) {
$list = $1;
}
} else {
## report error if the list is not empty but does not contain
## any attribute-value pair
$p = $p - length($list);
&addError($errType[13], $linenum, $p, $cachedCurLine);
$list = ""; ## to exit the loop
}
} # while loop
}
# Return the next non-blank line in the document
sub nonBlankLine() {
my $line = <INPUT>;
my $isBlank = 0;
## while loop for getting a non-blank line
while (defined($line) && ($isBlank == 0)) {
$linenum++;
if ($line =~ /\n$/) {
chop($line);
}
## remove any while space characters (space, tab, newline, etc.)
## before the string
if ($line =~ /[\s]*(.*)[\s]*/) {
$line = $1;
}
if (length($line) != 0) {
$isBlank = 1;
}
else {
$line = <INPUT>;
}
}
## cache the line to be processed for reporting error context.
$cachedCurLine = $line;
return $line;
}
# Add a new error to the error log
sub addError() {
my ($et, $en, $p, $cl) = @_;
my ($offset, $len, $substr);
$errorCount++;
print "Line $en, Col $p: \t$et.";
## check whether we need to report in verbose mode
if ($verbose) {
$offset = $p - $context / 2;
if ($offset < 0) { $offset = 0; }
$len = $context;
if ($offset + $len > length($cl)) {
$len = length($cl) - $offset - 1;
}
$substr = substr($cl, $offset, $len);
print "\t(...$substr...)";
}
print "\n";
}
## Print a summary of errors
sub printErrorSummary() {
if ($errorCount > 1) {
print "$errorCount errors found.\n";
} else {
print "$errorCount error found.\n";
}
}
##### Stack Implementation ######
# Push the record into the Stack
sub push() {
my ($tn, $ln, $p) = @_;
## chop off spaces around tag name
if ($tn =~ /[\s]*(.*)[\s]*/) {
$tn = $1;
}
@tagname[$top] = $tn;
@linenum[$top] = $ln;
@pos[$top] = $p;
## save the current line implicitly for reporting error context.
@cachedLine[$top] = $cachedCurLine;
$top++;
}
# Pop the top record from the Stack
sub pop() {
$top--;
}
# Return 1 if Stack is empty, 0 otherwise
sub empty() {
if (top == 0) {
return 1;
} else {
return 0;
}
}
# Return the Stack depth of the given tagname
sub locateTagname() {
my($tn) = @_;
my($d);
$d = $top-1;
$found = 0;
## chop off spaces around tag name
if ($tn =~ /[\s]*(.*)[\s]/) {
$tn = $1;
}
while (($d >= 0) && (!$found)) {
if ($tagname[$d] eq $tn) {
$found = 1;
} else {
$d--;
}
}
return $d;
}
# Pop the records from the given depth to the top of the Stack
sub popFrom() {
my($d) = @_;
my($i);
if (($d >= 0) && ($d < $top)) {
## pop each of the records from given depth up to the top
## for each records, report an error of missing end tag
for ($i=$d; $i<$top; $i++) {
## missing end tag
&addError($errType[9]." for \"".$tagname[$i]."\"",
$linenum[$i], $pos[$i], $cachedLine[$i]);
}
$top = $d;
}
}
# Clean up the Stack before we exit the program - check whether we have
# any unmatched records in the stack. Report error for each of them.
sub cleanupStack() {
for ($i=0; $i<$top; $i++) {
## missing end tag
&addError($errType[9]." for \"".$tagname[$i]."\"", $linenum[$i],
$pos[$i], $cachedLine[$i]);
}
$top = 0;
}
# Print the stack content for debugging
sub printStack() {
print "Stack: ";
for ($i=0; $i<$top; $i++) {
print "($tagname[$i], $linenum[$i], $pos[$i])\t";
}
print "\n";
}