tinyxmlparser.cpp (12945B)
- /*
- Copyright (c) 2000 Lee Thomason (www.grinninglizard.com)
- This software is provided 'as-is', without any express or implied
- warranty. In no event will the authors be held liable for any
- damages arising from the use of this software.
- Permission is granted to anyone to use this software for any
- purpose, including commercial applications, and to alter it and
- redistribute it freely, subject to the following restrictions:
- 1. The origin of this software must not be misrepresented; you must
- not claim that you wrote the original software. If you use this
- software in a product, an acknowledgment in the product documentation
- would be appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and
- must not be misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source
- distribution.
- */
- #include "tinyxml.h"
- #include <ctype.h>
- #include <cstring>
- const char* TiXmlBase::SkipWhiteSpace( const char* p )
- {
- while ( p && *p &&
- ( isspace( *p ) || *p == '\n' || *p == '\r' ) )
- p++;
- return p;
- }
- const char* TiXmlBase::ReadName( const char* p, std::string* name )
- {
- *name = "";
- const char* start = p;
- // Names start with letters or underscores.
- // After that, they can be letters, underscores, numbers,
- // hyphens, or colons. (Colons are valid ony for namespaces,
- // but tinyxml can't tell namespaces from names.)
- if ( p && ( isalpha( *p ) || *p == '_' ) )
- {
- p++;
- while( p && *p &&
- ( isalnum( *p )
- || *p == '_'
- || *p == '-'
- || *p == ':' ) )
- {
- p++;
- }
- name->append( start, p - start );
- return p;
- }
- return 0;
- }
- const char* TiXmlBase::ReadText( const char* p,
- std::string* text,
- bool trimWhiteSpace,
- const char* endTag,
- bool caseInsensitive )
- {
- *text = "";
- if ( !trimWhiteSpace // certain tags always keep whitespace
- /*|| !condenseWhiteSpace*/ ) // if true, whitespace is always kept
- {
- // Keep all the white space.
- while ( p && *p
- && strncmp( p, endTag, strlen(endTag) ) != 0
- )
- {
- char c = *(p++);
- (* text) += c;
- }
- }
- else
- {
- bool whitespace = false;
- // Remove leading white space:
- p = SkipWhiteSpace( p );
- while ( p && *p
- && strncmp( p, endTag, strlen(endTag) ) != 0 )
- {
- if ( *p == '\r' || *p == '\n' )
- {
- whitespace = true;
- ++p;
- }
- else if ( isspace( *p ) )
- {
- whitespace = true;
- ++p;
- }
- else
- {
- // If we've found whitespace, add it before the
- // new character. Any whitespace just becomes a space.
- if ( whitespace )
- {
- (* text) += ' ';
- whitespace = false;
- }
- char c = *(p++);
- (* text) += c;
- }
- }
- }
- return p + strlen( endTag );
- }
- const char* TiXmlDocument::Parse( const char* start )
- {
- // Parse away, at the document level. Since a document
- // contains nothing but other tags, most of what happens
- // here is skipping white space.
- const char* p = start;
- p = SkipWhiteSpace( p );
- if ( !p || !*p )
- {
- error = true;
- errorDesc = "Document empty.";
- }
- while ( p && *p )
- {
- if ( *p != '<' )
- {
- error = true;
- errorDesc = "The '<' symbol that starts a tag was not found.";
- break;
- }
- else
- {
- TiXmlNode* node = IdentifyAndParse( &p );
- if ( node )
- {
- LinkEndChild( node );
- }
- }
- p = SkipWhiteSpace( p );
- }
- return 0; // Return null is fine for a document: once it is read, the parsing is over.
- }
- TiXmlNode* TiXmlNode::IdentifyAndParse( const char** where )
- {
- const char* p = *where;
- TiXmlNode* returnNode = 0;
- assert( *p == '<' );
- TiXmlDocument* doc = GetDocument();
- p = SkipWhiteSpace( p+1 );
- // What is this thing?
- // - Elements start with a letter or underscore, but xml is reserved.
- // - Comments: <!--
- // - Everthing else is unknown to tinyxml.
- //
- if ( tolower( *(p+0) ) == '?'
- && tolower( *(p+1) ) == 'x'
- && tolower( *(p+2) ) == 'm'
- && tolower( *(p+3) ) == 'l' )
- {
- #ifdef DEBUG_PARSER
- printf( "XML parsing Declaration\n" );
- #endif
- returnNode = new TiXmlDeclaration();
- }
- else if ( isalpha( *p ) || *p == '_' )
- {
- #ifdef DEBUG_PARSER
- printf( "XML parsing Element\n" );
- #endif
- returnNode = new TiXmlElement( "" );
- }
- else if ( *(p+0) == '!'
- && *(p+1) == '-'
- && *(p+2) == '-' )
- {
- #ifdef DEBUG_PARSER
- printf( "XML parsing Comment\n" );
- #endif
- returnNode = new TiXmlComment();
- }
- else if ( strncmp(p, "![CDATA[", 8) == 0 )
- {
- TiXmlNode* cdataNode = new TiXmlCData();
- if ( !cdataNode )
- {
- if ( doc ) doc->SetError( TIXML_ERROR_OUT_OF_MEMORY );
- return 0;
- }
- returnNode = cdataNode;
- }
- else
- {
- #ifdef DEBUG_PARSER
- printf( "XML parsing Comment\n" );
- #endif
- returnNode = new TiXmlUnknown();
- }
- if ( returnNode )
- {
- // Set the parent, so it can report errors
- returnNode->parent = this;
- p = returnNode->Parse( p );
- }
- else
- {
- if ( doc )
- doc->SetError( TIXML_ERROR_OUT_OF_MEMORY );
- p = 0;
- }
- *where = p;
- return returnNode;
- }
- const char* TiXmlElement::Parse( const char* p )
- {
- TiXmlDocument* document = GetDocument();
- p = SkipWhiteSpace( p );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT );
- return 0;
- }
- // Read the name.
- p = ReadName( p, &value );
- if ( !p )
- {
- if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME );
- return 0;
- }
- std::string endTag = "</";
- endTag += value;
- endTag += ">";
- // Check for and read attributes. Also look for an empty
- // tag or an end tag.
- while ( p && *p )
- {
- p = SkipWhiteSpace( p );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
- return 0;
- }
- if ( *p == '/' )
- {
- // Empty tag.
- if ( *(p+1) != '>' )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY );
- return 0;
- }
- return p+2;
- }
- else if ( *p == '>' )
- {
- // Done with attributes (if there were any.)
- // Read the value -- which can include other
- // elements -- read the end tag, and return.
- p = ReadValue( p+1 ); // Note this is an Element method, and will set the error if one happens.
- if ( !p )
- return 0;
- // We should find the end tag now
- std::string buf( p, endTag.size() );
- if ( endTag == buf )
- {
- return p+endTag.size();
- }
- else
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG );
- return 0;
- }
- }
- else
- {
- // Try to read an element:
- TiXmlAttribute attrib;
- attrib.SetDocument( document );
- p = attrib.Parse( p );
- if ( p )
- {
- SetAttribute( attrib.Name(), attrib.Value() );
- }
- }
- }
- return 0;
- }
- const char* TiXmlElement::ReadValue( const char* p )
- {
- TiXmlDocument* document = GetDocument();
- // Read in text and elements in any order.
- p = SkipWhiteSpace( p );
- while ( p && *p )
- {
- const char* start = p;
- while ( *p && *p != '<' )
- p++;
- if ( !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE );
- return 0;
- }
- if ( p != start )
- {
- // Take what we have, make a text element.
- TiXmlText* text = new TiXmlText();
- if ( !text )
- {
- if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY );
- return 0;
- }
- text->Parse( start );
- if ( !text->Blank() )
- LinkEndChild( text );
- else
- delete text;
- }
- else
- {
- // We hit a '<'
- // Have we hit a new element or an end tag?
- if ( *(p+1) == '/' )
- {
- return p; // end tag
- }
- else
- {
- // TiXmlElement* element = new TiXmlElement( "" );
- //
- // if ( element )
- // {
- // p = element->Parse( p+1 );
- // if ( p )
- // LinkEndChild( element );
- // }
- // else
- // {
- // if ( document ) document->SetError( ERROR_OUT_OF_MEMORY );
- // return 0;
- // }
- TiXmlNode* node = IdentifyAndParse( &p );
- if ( node )
- {
- LinkEndChild( node );
- }
- else
- {
- return 0;
- }
- }
- }
- }
- return 0;
- }
- const char* TiXmlUnknown::Parse( const char* p )
- {
- const char* end = strchr( p, '>' );
- if ( !end )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_PARSING_UNKNOWN );
- return 0;
- }
- else
- {
- value = std::string( p, end-p );
- // value.resize( end - p );
- return end + 1; // return just past the '>'
- }
- }
- const char* TiXmlComment::Parse( const char* p )
- {
- assert( *p == '!' && *(p+1) == '-' && *(p+2) == '-' );
- // Find the end, copy the parts between to the value of
- // this object, and return.
- const char* start = p+3;
- const char* end = strstr( p, "-->" );
- if ( !end )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_PARSING_COMMENT );
- return 0;
- }
- else
- {
- // Assemble the comment, removing the white space.
- bool whiteSpace = false;
- const char* q;
- for( q=start; q<end; q++ )
- {
- if ( isspace( *q ) )
- {
- if ( !whiteSpace )
- {
- value += ' ';
- whiteSpace = true;
- }
- }
- else
- {
- value += *q;
- whiteSpace = false;
- }
- }
- // value = std::string( start, end-start );
- return end + 3; // return just past the '>'
- }
- }
- const char* TiXmlAttribute::Parse( const char* p )
- {
- // Read the name, the '=' and the value.
- p = ReadName( p, &name );
- if ( !p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
- return 0;
- }
- p = SkipWhiteSpace( p );
- if ( !p || *p != '=' )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
- return 0;
- }
- p = SkipWhiteSpace( p+1 );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
- return 0;
- }
- const char* end = 0;
- const char* start = p+1;
- const char* past = 0;
- if ( *p == '\'' )
- {
- end = strchr( start, '\'' );
- past = end+1;
- }
- else if ( *p == '"' )
- {
- end = strchr( start, '"' );
- past = end+1;
- }
- else
- {
- // All attribute values should be in single or double quotes.
- // But this is such a common error that the parser will try
- // its best, even without them.
- start--;
- for ( end = start; *end; end++ )
- {
- if ( isspace( *end ) || *end == '/' || *end == '>' )
- break;
- }
- past = end;
- }
- value = std::string( start, end-start );
- return past;
- }
- const char* TiXmlText::Parse( const char* p )
- {
- value = "";
- bool ignoreWhite = true;
- const char* end = "<";
- p = ReadText( p, &value, ignoreWhite, end, false );
- if ( p )
- return p-1; // don't truncate the '<'
- return 0;
- #if 0
- // Remove leading white space:
- p = SkipWhiteSpace( p );
- while ( *p && *p != '<' )
- {
- if ( *p == '\r' || *p == '\n' )
- {
- whitespace = true;
- }
- else if ( isspace( *p ) )
- {
- whitespace = true;
- }
- else
- {
- // If we've found whitespace, add it before the
- // new character. Any whitespace just becomes a space.
- if ( whitespace )
- {
- value += ' ';
- whitespace = false;
- }
- value += *p;
- }
- p++;
- }
- // Keep white space before the '<'
- if ( whitespace )
- value += ' ';
- return p;
- #endif
- }
- const char* TiXmlCData::Parse( const char* p )
- {
- value = "";
- bool ignoreWhite = false;
- p += 8;
- const char* end = "]]>";
- p = ReadText( p, &value, ignoreWhite, end, false );
- if ( p )
- return p;
- return 0;
- }
- const char* TiXmlDeclaration::Parse( const char* p )
- {
- // Find the beginning, find the end, and look for
- // the stuff in-between.
- const char* start = p+4;
- const char* end = strstr( start, "?>" );
- // Be nice to the user:
- if ( !end )
- {
- end = strstr( start, ">" );
- end++;
- }
- else
- {
- end += 2;
- }
- if ( !end )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_PARSING_DECLARATION );
- return 0;
- }
- else
- {
- const char* p;
- p = strstr( start, "version" );
- if ( p && p < end )
- {
- TiXmlAttribute attrib;
- attrib.Parse( p );
- version = attrib.Value();
- }
- p = strstr( start, "encoding" );
- if ( p && p < end )
- {
- TiXmlAttribute attrib;
- attrib.Parse( p );
- encoding = attrib.Value();
- }
- p = strstr( start, "standalone" );
- if ( p && p < end )
- {
- TiXmlAttribute attrib;
- attrib.Parse( p );
- standalone = attrib.Value();
- }
- }
- return end;
- }
- bool TiXmlText::Blank()
- {
- for ( unsigned i=0; i<value.size(); i++ )
- if ( !isspace( value[i] ) )
- return false;
- return true;
- }