00001 /* 00002 * parsing.h 00003 * 00004 * Copyright (C) 2007,2010 Thomas A. Vaughan 00005 * All rights reserved. 00006 * 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions are met: 00010 * * Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * * Redistributions in binary form must reproduce the above copyright 00013 * notice, this list of conditions and the following disclaimer in the 00014 * documentation and/or other materials provided with the distribution. 00015 * * Neither the name of the <organization> nor the 00016 * names of its contributors may be used to endorse or promote products 00017 * derived from this software without specific prior written permission. 00018 * 00019 * THIS SOFTWARE IS PROVIDED BY THOMAS A. VAUGHAN ''AS IS'' AND ANY 00020 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 00021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 00022 * DISCLAIMED. IN NO EVENT SHALL THOMAS A. VAUGHAN BE LIABLE FOR ANY 00023 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 00024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 00026 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00029 * 00030 * 00031 * Simple parsing helper methods. 00032 */ 00033 00034 #ifndef WAVEPACKET_PARSING_H__ 00035 #define WAVEPACKET_PARSING_H__ 00036 00037 // includes -------------------------------------------------------------------- 00038 #include "common/common.h" 00039 00040 #include <iostream> 00041 00042 00043 /// \ingroup util 00044 /*@{*/ 00045 00046 //////////////////////////////////////////////////////////////////////////////// 00047 // 00048 // parsing helper methods 00049 // 00050 //////////////////////////////////////////////////////////////////////////////// 00051 00052 /// a UTF-8 character. Note that this is multiple 8-bit chars in a sequence. 00053 /// See http://en.wikipedia.org/wiki/Utf8 00054 struct utf8_char_t { 00055 enum eConstants { 00056 eHighBit = 0x80, ///< high bit set 00057 eTwoHigh = 0xC0, ///< both high bits set 00058 eMaxChars = 6, ///< from RFC 3629 00059 eBufferSize = 8 ///< internal use only--keep dword align 00060 }; 00061 00062 // constructor, manipulators 00063 utf8_char_t(void) throw() { this->clear(); } 00064 void clear(void) throw() { 00065 nBytes = 1; 00066 value[0] = 0; // null character 00067 } 00068 int getByteCount(void) const throw() { return nBytes; } 00069 int getValue(void) const throw(); 00070 void setToReplacement(void) throw(); 00071 void dump(IN const char * title) const throw(); 00072 00073 /// "!c" means c is a null character, or invalid 00074 bool operator !(void) const throw() { 00075 return (nBytes < 1) || (0 == value[0]); 00076 } 00077 00078 /// inversion of ! operator: c is true if it is valid and non-null 00079 operator bool (void) const throw() { 00080 return (nBytes > 0) && (nBytes <= eMaxChars) && value[0]; 00081 } 00082 00083 /// is this valid? Can be both valid and null! 00084 bool isValid(void) const throw() { 00085 // note that we are not doing full validation of bit 00086 // flags 00087 return (nBytes > 0) && (nBytes <= eMaxChars); 00088 } 00089 00090 /// single-byte character comparison 00091 bool operator == (IN char a) const throw() { 00092 return (1 == nBytes) && (value[0] == a); 00093 } 00094 00095 // data fields 00096 char value[eBufferSize]; 00097 int nBytes; 00098 }; 00099 00100 00101 00102 /// is this character a single-byte (ASCII) UTF-8 character? 00103 inline bool isSingleByteUTF8Character(IN char a) throw() 00104 { 00105 // single-byte character if high bit is NOT set 00106 return (!(utf8_char_t::eHighBit & a)); 00107 } 00108 00109 00110 00111 /// is this character the leading byte of a multi-byte UTF-8 character? 00112 inline bool isMultiByteUTF8Character(IN char a) throw() 00113 { 00114 // multi-byte leading character if both high bits set 00115 return ((utf8_char_t::eTwoHigh & (byte_t) a) == utf8_char_t::eTwoHigh); 00116 } 00117 00118 00119 00120 /// is this character a (likely) leading byte for a UTF-8 character? 00121 inline bool isLeadingUTF8Byte(IN char a) throw() 00122 { 00123 // leading character if high bit is unset, or both high bits set 00124 return (isSingleByteUTF8Character(a) || 00125 isMultiByteUTF8Character(a)); 00126 } 00127 00128 00129 00130 /// is this character a trailing byte for a multi-byte UTF-8 character? 00131 inline bool isTrailingUTF8Byte(IN char a) throw() 00132 { 00133 // trailing character if high bit is set, next bit not set 00134 return ((utf8_char_t::eTwoHigh & (byte_t) a) == utf8_char_t::eHighBit); 00135 } 00136 00137 00138 00139 /// get the next (UTF-8) character from the stream 00140 /// returns false on eof (end of stream) 00141 bool getUTF8CharacterFromStream(IN std::istream& stream, 00142 OUT utf8_char_t& c); 00143 00144 00145 00146 /// get the next (UTF-8) character from the given string 00147 /// returns a pointer to the next character in the string 00148 const char * getUTF8CharacterFromString(IN const char * input, 00149 OUT utf8_char_t& c); 00150 00151 00152 00153 /// given an initial byte of a (potentially) multi-byte UTF-8 character, 00154 /// return how many bytes total in the character 00155 /// returns -1 on error. 00156 int getUTF8ByteCount(IN char a); 00157 00158 00159 00160 // sigh... Win32 version of isspace() is broken for UTF8! need to wrap 00161 #ifdef WIN32 00162 // win32 - force casting to char so win32 doesn't freak out 00163 #define isSpace( c ) isspace( (0xff & ((char) c)) ) 00164 #else // WIN32 00165 // other OS's: isspace() is okay 00166 #define isSpace( c ) isspace(c) 00167 #endif // WIN32 00168 00169 /// retrieves the next word from the specified line of text, and returns a 00170 /// pointer to the first character in the line after the word. Leading 00171 /// whitespace will be ignored when parsing. 00172 /// The word returned will either be empty, a newline, or a set of 00173 /// non-space characters. If the returned word is empty, then NULL is 00174 /// also the return value (word can only be empty at end-of-line). Note 00175 /// that you can have non-empty words returned with a NULL return value. 00176 const char * getNextWord(IN const char * text, 00177 OUT std::string& word); 00178 00179 00180 /// eParseBehavior: these can be OR'd together 00181 enum eParseBehavior { 00182 // fundamental behaviors 00183 eParse_None = 0x0000, ///< no special behavior 00184 eParse_StripComments = 0x0001, ///< strip comments (start with #) 00185 eParse_StripBogus = 0x0002, ///< strip bogus characters ('\\r') 00186 eParse_RespectQuotes = 0x0004, ///< treat quoted tokens as a unit 00187 00188 // common combinations 00189 eParse_Strip = 0x0003, ///< strip comments + bogus 00190 00191 // must be last! 00192 eParse_Invalid = 0x8000 00193 }; 00194 00195 00196 /// Reads to the next newline or end of stream, and stashes all characters 00197 /// (except the final newline) in the output string. 00198 /// Caller can ask to have comments and/or bogus characters stripped out. 00199 /// returns a std::string containing the line. throws on errors 00200 /// This routine is UTF-8 compliant. 00201 std::string getNextLineFromStream(IN std::istream& stream, 00202 IN eParseBehavior behavior); 00203 00204 00205 /// finds the next whitespace-delimited token, and puts it in the given token 00206 /// (std::string). 00207 /// returns a pointer to the character (whitespace or end of string) right 00208 /// after the token. 00209 /// This routine is UTF-8 compliant. 00210 const char * getNextTokenFromString(IN const char * input, 00211 OUT std::string& token, 00212 IN eParseBehavior); 00213 00214 00215 /// finds the next whitespace-delimited token, and puts it in the given 00216 /// buffer. 00217 /// returns a pointer to the character right after the token 00218 /// the chars parameter returns how many characters are in the buffer. 00219 /// the buffer is null-terminated 00220 /// if the buffer wasn't big enough for the token, the token will be 00221 /// truncated. The buffer will be null-terminated (the routine 00222 /// saves enough room for the final null). chars will contain the 00223 /// number of characters parsed from the string, even if not all of 00224 /// them were copied into the buffer. So you can detect truncation 00225 /// if chars >= buffer_size. 00226 /// This routine is UTF-8 compliant 00227 const char * getNextTokenFromString(IN const char * input, 00228 IO char * buffer, 00229 IN int buffer_size, 00230 IN eParseBehavior, 00231 OUT int& chars) throw(); 00232 00233 00234 /// expects the given token to be next in the string. Throws if the 00235 /// next token is something else. Returns a pointer to the character 00236 /// immediately following the expected token. 00237 const char * expectFromString(IN const char * input, 00238 IN const char * expect, 00239 IN eParseBehavior); 00240 00241 00242 /// helper method to extract a float (assuming it is the next token) 00243 /// This routine is UTF-8 compliant 00244 const char * getFloatFromString(IN const char * input, 00245 OUT float& x) throw(); 00246 00247 00248 /// helper method to read up to N floats. Returns the number read. 00249 /// Assumes that that caller doesn't need the string returned! 00250 /// The caller must provide a pre-allocated array of floats for 00251 /// output. 00252 int readFloatsFromString(IN const char * input, 00253 IN int nFloats, 00254 OUT float * output) throw(); 00255 00256 00257 /// tries to determine if the given string is a true or false value. 00258 /// This is NOT localized. This is for machine-parsed input only, 00259 /// such as config files. A string is considered false if it is 00260 /// the word "false" (any case), the single letter "F", the 00261 /// character "0", or empty. Any other value is considered true. 00262 /// Note that expressions that evaluate to zero but are not a 00263 /// single digit, such as "-0" or "+0", are considered true. 00264 bool getBooleanFromString(IN const char * input) throw(); 00265 00266 00267 /// true if the given character is bogus ('\\r', etc) 00268 bool isBogus(IN char a) throw(); 00269 00270 00271 /// dictionary routines 00272 /// These let you construct a dictionary (key -> value map) based on an input 00273 /// string, and then look for required/optional values. 00274 typedef Dictionary dictionary_t; // map of key->value pairs 00275 00276 void getDictionaryFromString(IN const char * string, 00277 IN const char * debug_info, 00278 OUT dictionary_t& data); 00279 00280 const char * getValue(IN const dictionary_t&, IN const char * key); 00281 const char * getRequiredValue(IN const dictionary_t&, IN const char * key); 00282 const char * getOptionalValue(IN const dictionary_t&, IN const char * key, 00283 IN const char * default_value) throw(); 00284 00285 00286 /// inplace conversion to lower case (returns pointer to beginning of string) 00287 //char * lowercase(char * text) throw(); 00288 00289 #endif // WAVEPACKET_PARSING_H__ 00290