parsing.h

Go to the documentation of this file.
00001 /*
00002  * parsing.h
00003  *
00004  * Copyright (C) 2007,2010  Thomas A. Vaughan
00005  * All rights reserved.
00006  *
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions are met:
00010  *     * Redistributions of source code must retain the above copyright
00011  *       notice, this list of conditions and the following disclaimer.
00012  *     * Redistributions in binary form must reproduce the above copyright
00013  *       notice, this list of conditions and the following disclaimer in the
00014  *       documentation and/or other materials provided with the distribution.
00015  *     * Neither the name of the <organization> nor the
00016  *       names of its contributors may be used to endorse or promote products
00017  *       derived from this software without specific prior written permission.
00018  *
00019  * THIS SOFTWARE IS PROVIDED BY THOMAS A. VAUGHAN ''AS IS'' AND ANY
00020  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
00021  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00022  * DISCLAIMED. IN NO EVENT SHALL THOMAS A. VAUGHAN BE LIABLE FOR ANY
00023  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00024  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00025  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00026  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00027  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00028  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  *
00030  *
00031  * Simple parsing helper methods.
00032  */
00033 
00034 #ifndef WAVEPACKET_PARSING_H__
00035 #define WAVEPACKET_PARSING_H__
00036 
00037 // includes --------------------------------------------------------------------
00038 #include "common/common.h"
00039 
00040 #include <iostream>
00041 
00042 
00043 /// \ingroup util
00044 /*@{*/
00045 
00046 ////////////////////////////////////////////////////////////////////////////////
00047 //
00048 //      parsing helper methods
00049 //
00050 ////////////////////////////////////////////////////////////////////////////////
00051 
00052 /// a UTF-8 character.  Note that this is multiple 8-bit chars in a sequence.
00053 ///     See http://en.wikipedia.org/wiki/Utf8
00054 struct utf8_char_t {
00055         enum eConstants {
00056                 eHighBit        = 0x80, ///< high bit set
00057                 eTwoHigh        = 0xC0, ///< both high bits set
00058                 eMaxChars       = 6,    ///< from RFC 3629
00059                 eBufferSize     = 8     ///< internal use only--keep dword align
00060         };
00061 
00062         // constructor, manipulators
00063         utf8_char_t(void) throw() { this->clear(); }
00064         void clear(void) throw() {
00065                         nBytes = 1;
00066                         value[0] = 0;   // null character
00067                 }
00068         int getByteCount(void) const throw() { return nBytes; }
00069         int getValue(void) const throw();
00070         void setToReplacement(void) throw();
00071         void dump(IN const char * title) const throw();
00072 
00073         /// "!c" means c is a null character, or invalid
00074         bool operator !(void) const throw() {
00075                         return (nBytes < 1) || (0 == value[0]);
00076                 }
00077 
00078         /// inversion of ! operator: c is true if it is valid and non-null
00079         operator bool (void) const throw() {
00080                         return (nBytes > 0) && (nBytes <= eMaxChars) && value[0];
00081                 }
00082 
00083         /// is this valid?  Can be both valid and null!
00084         bool isValid(void) const throw() {
00085                         // note that we are not doing full validation of bit
00086                         // flags
00087                         return (nBytes > 0) && (nBytes <= eMaxChars);
00088                 }
00089 
00090         /// single-byte character comparison
00091         bool operator == (IN char a) const throw() {
00092                         return (1 == nBytes) && (value[0] == a);
00093                 }
00094 
00095         // data fields
00096         char            value[eBufferSize];
00097         int             nBytes;
00098 };
00099 
00100 
00101 
00102 /// is this character a single-byte (ASCII) UTF-8 character?
00103 inline bool isSingleByteUTF8Character(IN char a) throw()
00104 {
00105         // single-byte character if high bit is NOT set
00106         return (!(utf8_char_t::eHighBit & a));
00107 }
00108 
00109 
00110 
00111 /// is this character the leading byte of a multi-byte UTF-8 character?
00112 inline bool isMultiByteUTF8Character(IN char a) throw()
00113 {
00114         // multi-byte leading character if both high bits set
00115         return ((utf8_char_t::eTwoHigh & (byte_t) a) == utf8_char_t::eTwoHigh);
00116 }
00117 
00118 
00119 
00120 /// is this character a (likely) leading byte for a UTF-8 character?
00121 inline bool isLeadingUTF8Byte(IN char a) throw()
00122 {
00123         // leading character if high bit is unset, or both high bits set
00124         return (isSingleByteUTF8Character(a) ||
00125                 isMultiByteUTF8Character(a));
00126 }
00127 
00128 
00129 
00130 /// is this character a trailing byte for a multi-byte UTF-8 character?
00131 inline bool isTrailingUTF8Byte(IN char a) throw()
00132 {
00133         // trailing character if high bit is set, next bit not set
00134         return ((utf8_char_t::eTwoHigh & (byte_t) a) == utf8_char_t::eHighBit);
00135 }
00136 
00137 
00138 
00139 /// Reads the next (UTF-8) character from the stream into c.
00140 ///     Returns false on eof (end of stream).
00141 bool getUTF8CharacterFromStream(IN std::istream& stream,
00142                         OUT utf8_char_t& c);
00143 
00144 
00145 
00146 /// Extracts the next (UTF-8) character from the given string into c.
00147 ///     Returns a pointer to the next character in the string.
00148 const char * getUTF8CharacterFromString(IN const char * input,
00149                         OUT utf8_char_t& c);
00150 
00151 
00152 
00153 /// Given the initial byte of a (potentially) multi-byte UTF-8 character,
00154 ///     returns the total number of bytes in that character.
00155 /// Returns -1 on error.
00156 int getUTF8ByteCount(IN char a);
00157 
00158 
00159 
// isspace() wrapper that is safe for the individual bytes of UTF-8 text.
// isspace() only accepts values representable as unsigned char (or EOF), and
// plain char may be signed, so bytes >= 0x80 must be converted before the
// call; the Win32 runtime was noted to misbehave on such input.  The same
// conversion is applied on every platform.  The macro argument is
// parenthesized so that any expression can be passed.
#define isSpace( c ) isspace( (unsigned char) (c) )
00168 
00169 /// Retrieves the next word from the specified line of text, and returns a
00170 ///     pointer to the first character in the line after the word.  Leading
00171 ///     whitespace is ignored when parsing.
00172 ///     The word returned is either empty, a newline, or a set of
00173 ///     non-space characters.  An empty word always comes with a NULL
00174 ///     return value (word can only be empty at end-of-line).  Note that
00175 ///     a NULL return value can also accompany a non-empty word.
00176 const char * getNextWord(IN const char * text,
00177                         OUT std::string& word);
00178 
00179 
/// eParseBehavior: flags for the parsing helpers.  The fundamental
///     behaviors are single bits, so these can be OR'd together.
enum eParseBehavior {
        // fundamental behaviors (one bit each)
        eParse_None             = 0,            ///< no special behavior
        eParse_StripComments    = 1 << 0,       ///< strip comments (start with #)
        eParse_StripBogus       = 1 << 1,       ///< strip bogus characters ('\\r')
        eParse_RespectQuotes    = 1 << 2,       ///< treat quoted tokens as a unit

        // common combinations
        eParse_Strip            = eParse_StripComments | eParse_StripBogus,     ///< strip comments + bogus

        // must be last!
        eParse_Invalid          = 1 << 15
};
00194 
00195 
00196 ///   Reads to the next newline or end of stream, and collects all characters
00197 ///     (except the final newline) in the output string.
00198 ///   Caller can ask (via behavior) to have comments and/or bogus characters stripped out.
00199 ///   Returns a std::string containing the line.  Throws on errors.
00200 ///     This routine is UTF-8 compliant.
00201 std::string getNextLineFromStream(IN std::istream& stream,
00202                                 IN eParseBehavior behavior);
00203 
00204 
00205 ///   Finds the next whitespace-delimited token, and puts it in the given token
00206 ///     (std::string).
00207 ///   Returns a pointer to the character (whitespace or end of string) right
00208 ///     after the token.
00209 ///     This routine is UTF-8 compliant.
00210 const char * getNextTokenFromString(IN const char * input,
00211                                 OUT std::string& token,
00212                                 IN eParseBehavior);
00213 
00214 
00215 ///  Finds the next whitespace-delimited token, and puts it in the given
00216 ///    buffer.
00217 ///  Returns a pointer to the character right after the token.
00218 ///  The chars parameter returns how many characters the token has (see below).
00219 ///  The buffer is null-terminated.
00220 ///  If the buffer wasn't big enough for the token, the token will be
00221 ///    truncated.  The buffer will still be null-terminated (the routine
00222 ///    saves enough room for the final null).  chars will contain the
00223 ///    number of characters parsed from the string, even if not all of
00224 ///    them were copied into the buffer.  So truncation can be detected
00225 ///    by checking for chars >= buffer_size.
00226 ///     This routine is UTF-8 compliant.
00227 const char * getNextTokenFromString(IN const char * input,
00228                                 IO char * buffer,
00229                                 IN int buffer_size,
00230                                 IN eParseBehavior,
00231                                 OUT int& chars) throw();
00232 
00233 
00234 /// Expects the given token (expect) to be next in the input string.  Throws
00235 ///     if the next token is something else.  Returns a pointer to the character
00236 ///     immediately following the expected token.
00237 const char * expectFromString(IN const char * input,
00238                                 IN const char * expect,
00239                                 IN eParseBehavior);
00240 
00241 
00242 /// Helper method to extract a float into x (assuming it is the next token in input).
00243 ///     This routine is UTF-8 compliant.  NOTE(review): return value is presumably the position after the float -- confirm.
00244 const char * getFloatFromString(IN const char * input,
00245                                 OUT float& x) throw();
00246 
00247 
00248 /// Helper method to read up to N (nFloats) floats.  Returns the number read.
00249 /// Assumes that the caller doesn't need the string position returned!
00250 /// The caller must provide a pre-allocated array of floats for
00251 ///     output.
00252 int readFloatsFromString(IN const char * input,
00253                                 IN int nFloats,
00254                                 OUT float * output) throw();
00255 
00256 
00257 /// Tries to determine if the given string is a true or false value.
00258 ///     This is NOT localized.  This is for machine-parsed input only,
00259 ///     such as config files.  A string is considered false if it is
00260 ///     the word "false" (any case), the single letter "F", the
00261 ///     character "0", or empty.  Any other value is considered true.
00262 ///     Note that expressions that evaluate to zero but are not the
00263 ///     single digit "0", such as "-0" or "+0", are considered true.
00264 bool getBooleanFromString(IN const char * input) throw();
00265 
00266 
00267 /// Returns true if the given character is bogus ('\\r', etc).
00268 bool isBogus(IN char a) throw();
00269 
00270 
00271 /// dictionary routines
00272 ///   These let you construct a dictionary (key -> value map) based on an input
00273 ///   string, and then look up required/optional values.
00274 typedef Dictionary dictionary_t;        // map of key->value pairs; Dictionary is not declared in this header (presumably common/common.h -- confirm)
00275 
00276 void getDictionaryFromString(IN const char * string,     // input string the dictionary is built from
00277                                 IN const char * debug_info,     // NOTE(review): presumably only used for diagnostics -- confirm
00278                                 OUT dictionary_t& data);        // receives the key -> value pairs
00279 
00280 const char * getValue(IN const dictionary_t&, IN const char * key);     // NOTE(review): result for a missing key is not visible here -- confirm
00281 const char * getRequiredValue(IN const dictionary_t&, IN const char * key);     // no throw() spec: presumably throws when key is missing -- confirm
00282 const char * getOptionalValue(IN const dictionary_t&, IN const char * key,
00283                         IN const char * default_value) throw(); // presumably yields default_value when key is missing -- confirm
00284 
00285 
00286 /// in-place conversion to lower case (returns pointer to beginning of string)
00287 //char * lowercase(char * text) throw();
00288 
00289 #endif  // WAVEPACKET_PARSING_H__
00290