parsing.cpp

Go to the documentation of this file.
00001 /*
00002  * parsing.cpp
00003  *
00004  * Copyright (C) 2007,2010  Thomas A. Vaughan
00005  * All rights reserved.
00006  *
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions are met:
00010  *     * Redistributions of source code must retain the above copyright
00011  *       notice, this list of conditions and the following disclaimer.
00012  *     * Redistributions in binary form must reproduce the above copyright
00013  *       notice, this list of conditions and the following disclaimer in the
00014  *       documentation and/or other materials provided with the distribution.
00015  *     * Neither the name of the <organization> nor the
00016  *       names of its contributors may be used to endorse or promote products
00017  *       derived from this software without specific prior written permission.
00018  *
00019  * THIS SOFTWARE IS PROVIDED BY THOMAS A. VAUGHAN ''AS IS'' AND ANY
00020  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
00021  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00022  * DISCLAIMED. IN NO EVENT SHALL THOMAS A. VAUGHAN BE LIABLE FOR ANY
00023  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00024  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00025  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00026  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00027  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00028  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  *
00030  *
00031  * Implementation of routines in parsing.h
00032  */
00033 
00034 // includes --------------------------------------------------------------------
00035 #include "parsing.h"            // always include our own header first!
00036 
00037 #include "common/wave_ex.h"     // exceptions
00038 
00039 
00040 
00041 static const byte_t s_maskHigh          = 0x80; // high bit set; getUTF8ByteCount() treats a byte without it as ASCII
00042 static const byte_t s_maskTwoHigh       = 0xC0; // both high bits set; getUTF8ByteCount() requires this of any non-ASCII leading byte
00043 
00044 
00045 ////////////////////////////////////////////////////////////////////////////////
00046 //
00047 //      static helper methods
00048 //
00049 ////////////////////////////////////////////////////////////////////////////////
00050 
00051 
00052 ////////////////////////////////////////////////////////////////////////////////
00053 //
00054 //      public API
00055 //
00056 ////////////////////////////////////////////////////////////////////////////////
00057 
00058 void
00059 utf8_char_t::setToReplacement
00060 (
00061 void
00062 )
00063 throw()
00064 {
00065         // DPRINTF("Setting replacement!");
00066         // replacement character: Unicode 0xFFFD
00067         // That's 3 bytes in UTF-8
00068         nBytes = 3;
00069         value[0] = (char) 0xEF;
00070         value[1] = (char) 0xBF;
00071         value[2] = (char) 0xBD;
00072         value[3] = 0;
00073 }
00074 
00075 
00076 
00077 int
00078 utf8_char_t::getValue
00079 (
00080 void
00081 )
00082 const
00083 throw()
00084 {
00085         if (1 == nBytes) {
00086                 return value[0];
00087         } else if (nBytes > 1 && nBytes <= eMaxChars) {
00088                 int nBits = 7 - nBytes;
00089                 int retval = 0;
00090                 for (int i = 0; i < nBytes; ++i) {
00091         //              DPRINTF("  byte[%d] = 0x%02x", i, (byte_t) value[i]);
00092                         int mask = 1 << (nBits - 1);
00093                         for (int j = 0; j < nBits; ++j) {
00094                                 retval *= 2;
00095                                 if (((byte_t) value[i]) & mask) {
00096                                         retval += 1;
00097                                 }
00098                                 mask /= 2;
00099                         }
00100                         nBits = 6;
00101                 }
00102                 return retval;
00103         } else {
00104                 ASSERT(false, "Bad byte count for utf-8 character: %d", nBytes);
00105         }
00106 
00107         ASSERT(false, "should never get here!");
00108         return 0;
00109 }
00110 
00111 
00112 
00113 void
00114 utf8_char_t::dump
00115 (
00116 IN const char * title
00117 )
00118 const
00119 throw()
00120 {
00121         ASSERT(title, "null");
00122 
00123         DPRINTF("%s (utf-8 char): %d bytes value=0x%04x '%s'",
00124                 title, nBytes, getValue(), value);
00125 }
00126 
00127 
00128 
00129 bool
00130 getUTF8CharacterFromStream
00131 (
00132 IN std::istream& stream,
00133 OUT utf8_char_t& c
00134 )
00135 {
00136         ASSERT_THROW(stream.good(), "bad stream reading UTF8 character");
00137         c.clear();
00138 
00139         // get the first character
00140         char a;
00141         stream.read(&a, 1);
00142         if (stream.eof()) {
00143                 return false;
00144         }
00145 
00146         // valid first character?
00147         if (isLeadingUTF8Byte(a)) {
00148                 c.value[0] = a;
00149                 c.nBytes = getUTF8ByteCount(a);
00150                 if (c.nBytes < 1) {
00151                         c.setToReplacement();
00152                         return true;
00153                 }
00154         } else {
00155                 c.setToReplacement();
00156                 return true;
00157         }
00158 
00159 //      DPRINTF("Character 0x%02x has byte count: %d", a, c.nBytes);
00160         for (int i = 1; i < c.nBytes; ++i) {
00161                 ASSERT_THROW(stream.good(),
00162                     "bad stream reading next byte of UTF8 character");
00163                 stream.read(&a, 1);
00164                 if (!isTrailingUTF8Byte(a)) {
00165                         c.setToReplacement();
00166                         return true;
00167                 }
00168                 c.value[i] = a;
00169         }
00170         c.value[c.nBytes] = 0;
00171         return true;
00172 }
00173 
00174 
00175 
00176 const char *
00177 getUTF8CharacterFromString
00178 (
00179 IN const char * input,
00180 OUT utf8_char_t& c
00181 )
00182 {
00183         ASSERT(input, "null");
00184         c.clear();
00185 
00186         // get the first character
00187         char a = *input;
00188         ++input;
00189 
00190         if (isLeadingUTF8Byte(a)) {
00191                 c.value[0] = a;
00192                 c.nBytes = getUTF8ByteCount(a);
00193                 if (c.nBytes < 1) {
00194                         c.setToReplacement();
00195                         return input;
00196                 }
00197         } else {
00198                 c.setToReplacement();
00199                 return input;
00200         }
00201 
00202 //      DPRINTF("Character 0x%02x has byte count: %d", a, c.nBytes);
00203         for (int i = 1; i < c.nBytes; ++i) {
00204                 char a = *input;
00205                 ++input;
00206                 if (!isTrailingUTF8Byte(a)) {
00207                         c.setToReplacement();
00208                         return input;
00209                 }
00210                 c.value[i] = a;
00211         }
00212         c.value[c.nBytes] = 0;
00213 
00214         // all done--returning new head of string
00215         return input;
00216 }
00217 
00218 
00219 
00220 int
00221 getUTF8ByteCount
00222 (
00223 IN char a
00224 )
00225 {
00226         byte_t b = (byte_t) a;
00227 
00228         // first, a quick test: if high bit isn't set, this is ASCII
00229         if (!(s_maskHigh & b))
00230                 return 1;
00231 
00232         // if high bit is set, both high bits should be set!
00233         ASSERT_THROW((s_maskTwoHigh & b) == s_maskTwoHigh,
00234                 "bad leading UTF8 character: " << (int) b);
00235 
00236         // okay, look at number of set high bits
00237         byte_t mask = 0x20;     // third bit set
00238         int count = 2;
00239         while (mask & b) {
00240                 mask /= 2;
00241                 ++count;
00242         }
00243         if (count > utf8_char_t::eMaxChars) {
00244                 return -1;
00245         }
00246 
00247         return count;
00248 }
00249 
00250 
00251 
00252 std::string
00253 getNextLineFromStream
00254 (
00255 IN std::istream& stream,
00256 IN eParseBehavior behavior
00257 )
00258 {
00259         ASSERT(stream.good(), "bad stream?");
00260         ASSERT(!(eParse_Invalid & behavior), "bad behavior");
00261 
00262         // DPRINTF("Parsing line...");
00263 
00264         std::string output;
00265 
00266         bool in_quote = false;
00267         bool in_comment = false;
00268 
00269         while (true) {
00270                 if (stream.fail() || !stream.good()) {
00271                         WAVE_EX(wex);
00272                         wex << "Failure reading line from stream.";
00273                 }
00274 
00275                 char a;
00276                 stream.read(&a, 1);
00277                 if (stream.eof())
00278                         break;  // end of stream!
00279 
00280                 if ('\n' == a)
00281                         break;  // end of line!
00282 
00283                 if ((eParse_RespectQuotes & behavior) &&
00284                     '\"' == a)
00285                         in_quote = !in_quote;   // toggle
00286 
00287                 if (!in_quote &&
00288                     '#' == a &&
00289                     (eParse_StripComments & behavior))
00290                         in_comment = true;      // hit a comment
00291 
00292                 if (!in_comment) {
00293                         // not in a comment--should we add this?
00294                         if (!(eParse_StripBogus & behavior) ||
00295                             !isBogus(a)) {
00296                                 output += a;
00297                         }
00298                 }
00299         }
00300 
00301         return output;
00302 }
00303 
00304 
00305 
00306 const char *
00307 getNextTokenFromString
00308 (
00309 IN const char * input,
00310 OUT std::string& token,
00311 IN eParseBehavior behavior
00312 )
00313 {
00314         ASSERT(input, "null input buffer");
00315         ASSERT(!(eParse_Invalid & behavior), "invalid behavior");
00316         token.clear();
00317 
00318         // DPRINTF("Parsing line for next token: '%s'...", input);
00319 
00320         const char * p = input;
00321 
00322         // skip leading whitespace (safe for UTF-8!)
00323         while (*p && isSpace(*p)) {
00324                 ++p;
00325         }
00326 
00327         // is the first character a quotes?
00328         if ((eParse_RespectQuotes & behavior) && '\"' == *p) {
00329                 ++p;    // go to next character
00330                 while (*p && '\"' != *p) {
00331                         token += *p;
00332                         ++p;
00333                 }
00334                 if ('"' == *p) {
00335                         ++p;    // skip the closing quotes
00336                 } else {
00337                         DPRINTF("WARNING unterminated quote token: %s",
00338                             token.c_str());
00339                 }
00340         } else {
00341                 // NOT in quotes
00342                 // keep adding to token until hit end or buffer
00343                 while (*p && !isSpace(*p)) {
00344                         token += *p;
00345                         ++p;
00346                 }
00347         }
00348 
00349         // if we still have non-whitespace characters, traverse
00350         while (*p && !isSpace(*p)) {
00351                 ++p;
00352         }
00353 
00354         // DPRINTF("  extracted token = '%s'", token.c_str());
00355 
00356         // return new position
00357         return p;
00358 }
00359 
00360 
00361 
00362 const char *
00363 getNextTokenFromString
00364 (
00365 IN const char * input,
00366 IO char * buffer,
00367 IN int buffer_size,
00368 IN eParseBehavior behavior,
00369 OUT int& chars
00370 )
00371 throw()
00372 {
00373         ASSERT(input, "null");
00374         ASSERT(buffer, "null");
00375         ASSERT(buffer_size > 0, "Bad buffer size: %d", buffer_size);
00376         ASSERT(eParse_None == behavior, "unsupported parse behavior!");
00377         chars = 0;
00378 
00379         const char * p = input;
00380 
00381         // skip leading whitespace
00382         for (; *p && isSpace(*p); ++p) { }
00383 
00384         // read input
00385         for (; *p && !isSpace(*p); ++p) {
00386                 if (chars < buffer_size) {
00387                         buffer[chars] = *p;
00388                 }
00389                 ++chars;
00390         }
00391 
00392         // null terminate
00393         if (chars < buffer_size) {
00394                 buffer[chars] = 0;
00395         } else {
00396                 buffer[buffer_size - 1] = 0;
00397         }
00398 
00399         // all done
00400         return p;
00401 }
00402 
00403 
00404 
00405 const char *
00406 expectFromString
00407 (
00408 IN const char * input,
00409 IN const char * expect,
00410 IN eParseBehavior parse
00411 )
00412 {
00413         ASSERT(input, "null");
00414         ASSERT(expect, "null");
00415 
00416         // skip any whitespace
00417         while (*input && isSpace(*input)) {
00418                 ++input;
00419         }
00420 
00421         // now everything should match
00422         const char * p = expect;
00423         while (*p) {
00424                 ASSERT_THROW(*input == *p,
00425                     "Expected token '" << expect << "' not found");
00426                 ++p;
00427                 ++input;
00428         }
00429 
00430         // should be end of input, or space
00431         ASSERT_THROW(!*input || isSpace(*input),
00432             "Expected token '" << expect << "' has trailing characters");
00433 
00434         // must be good!
00435         return input;
00436 }
00437 
00438 
00439 
00440 bool
00441 isBogus
00442 (
00443 IN char a
00444 )
00445 throw()
00446 {
00447         return (a == '\r');
00448 }
00449 
00450 
00451 
00452 char *
00453 stripComments
00454 (
00455 IN char * line
00456 )
00457 throw()
00458 {
00459         ASSERT(line, "null");
00460 
00461         bool in_quotes = false;
00462         char * p = line;
00463         while (*p) {
00464                 if ('\"' == *p) {
00465                         in_quotes = !in_quotes;
00466                 } else if (!in_quotes && '#' == *p) {
00467                         *p = 0;
00468                         return line;
00469                 }
00470                 ++p;
00471         }
00472 
00473         // no comments encountered
00474         return line;
00475 }
00476 
00477 
00478 
00479 char *
00480 stripBogus
00481 (
00482 IN char * line
00483 )
00484 throw()
00485 {
00486         char * src = line;
00487         char * dst = line;
00488 
00489         while (*src) {
00490                 if (!isBogus(*src)) {
00491                         *dst = *src;
00492                         ++dst;
00493                 }
00494                 ++src;
00495         }
00496         *dst = 0;
00497         return line;
00498 }
00499 
00500 
00501 
00502 char *
00503 lowercase
00504 (
00505 IN char * wext
00506 )
00507 throw()
00508 {
00509         ASSERT(wext, "null");
00510 
00511         char * p = wext;
00512         while (*p) {
00513                 *p = tolower(*p);
00514                 ++p;
00515         }
00516 
00517         return wext;
00518 }
00519 
00520 
00521 
00522 void
00523 getDictionaryFromString
00524 (
00525 IN const char * cursor,
00526 IN const char * info,
00527 OUT dictionary_t& dict
00528 )
00529 {
00530         ASSERT(cursor, "null string passed to getDictionaryFromString()");
00531         ASSERT(info, "null debug info passed to getDictionaryFromString()");
00532         dict.clear();
00533 
00534         std::string key, value;
00535 
00536         // keep looping through pairs of tokens
00537         while (true) {
00538                 cursor = getNextTokenFromString(cursor, key, eParse_None);
00539                 if ("" == key) {
00540                         break;  // no more key/value pairs
00541                 }
00542 
00543                 // should only have a given key once
00544                 dictionary_t::iterator i = dict.find(key);
00545                 if (dict.end() != i) {
00546                         WAVE_EX(wex);
00547                         wex << "Invalid key/value string for '" << info;
00548                         wex << "'.  Key '" << key << "' appears more than once.";
00549                 }
00550 
00551                 cursor = getNextTokenFromString(cursor, value,
00552                     eParse_RespectQuotes);
00553                 dict[key] = value;
00554         }
00555 }
00556 
00557 
00558 
00559 const char *
00560 getValue
00561 (
00562 IN const dictionary_t& data,
00563 IN const char * key
00564 )
00565 {
00566         dictionary_t::const_iterator i = data.find(key);
00567         if (data.end() == i)
00568                 return NULL;
00569         return i->second.c_str();
00570 }
00571 
00572 
00573 
00574 const char *
00575 getRequiredValue
00576 (
00577 IN const dictionary_t& data,
00578 IN const char * key
00579 )
00580 {
00581         ASSERT(key, "null key passed in to getRequiredValue()");
00582 
00583         const char * value = getValue(data, key);
00584         if (!value) {
00585                 // couldn't find an entry with this key
00586                 // throw a std::exception
00587                 WAVE_EX(wex);
00588                 wex << "Missing a required parameter: '" << key << "'";
00589         }
00590 
00591         // found it!
00592         return value;
00593 }
00594 
00595 
00596 
00597 
00598 const char *
00599 getOptionalValue
00600 (
00601 IN const dictionary_t& data,
00602 IN const char * key,
00603 IN const char * default_value
00604 )
00605 throw()
00606 {
00607         ASSERT(key, "null key passed in to getOptionalValue()");
00608         ASSERT(default_value,
00609             "null default value passed in to getOptionalValue()");
00610 
00611         const char * value = getValue(data, key);
00612         return (value) ? value : default_value;
00613 }
00614 
00615 
00616 
00617 const char *
00618 getFloatFromString
00619 (
00620 IN const char * input,
00621 OUT float& x
00622 )
00623 throw()
00624 {
00625         ASSERT(input, "null");
00626         x = 0;
00627 
00628         static const int s_maxSize = 15;
00629         static const int s_bufSize = s_maxSize + 1;
00630         char buffer[s_bufSize];
00631 
00632         int nChars = 0;
00633         input = getNextTokenFromString(input, buffer, s_bufSize, eParse_None,
00634             nChars);
00635         if (nChars > s_maxSize) {
00636                 DPRINTF("Warning: float was truncated!  "
00637                     "(read %d out of %d characters)", s_maxSize, nChars);
00638         }
00639         x = atof(buffer);
00640 
00641         return input;
00642 }
00643 
00644 
00645 
00646 int
00647 readFloatsFromString
00648 (
00649 IN const char * input,
00650 IN int nFloats,
00651 OUT float * output
00652 )
00653 throw()
00654 {
00655         ASSERT(input, "null");
00656         ASSERT(nFloats > 0, "Bad float count: %d", nFloats);
00657         ASSERT(output, "null");
00658 
00659         for (int i = 0; i < nFloats; ++i, ++output) {
00660                 if (!*input) {
00661                         return i;
00662                 }
00663 
00664                 input = getFloatFromString(input, *output);
00665         }
00666 
00667         // read all requested!
00668         return nFloats;
00669 }
00670 
00671 
00672 
00673 bool
00674 getBooleanFromString
00675 (
00676 IN const char * input
00677 )
00678 throw()
00679 {
00680         ASSERT(input, "null");
00681 
00682         // empty string?  means false
00683         if (!*input)
00684                 return false;
00685 
00686         // single character string?
00687         if (!*(input + 1)) {
00688                 if ('F' == *input || 'f' == *input || '0' == *input)
00689                         return false;
00690                 return true;
00691         }
00692 
00693         // long string.  Only false value is the word "false"!
00694         return (0 != strcasecmp("false", input));
00695 }
00696 
00697 
00698 
00699 const char *
00700 getNextWord
00701 (
00702 IN const char * line,
00703 OUT std::string& word
00704 )
00705 {
00706         ASSERT(line, "null");
00707         word.clear();
00708 
00709         // UTF-8 compliant
00710         const char * p = line;
00711 
00712         // skip any leading whitespace
00713         for (; *p && '\n' != *p && isSpace(*p); ++p) { }
00714 
00715         // is this a newline?
00716         if ('\n' == *p) {
00717                 word = "\n";
00718                 return p + 1;
00719         }
00720 
00721         // add any non-whitespace characters, except newlines
00722         for (; *p && '\n' != *p && !isSpace(*p); ++p) {
00723                 word += *p;
00724         }
00725 
00726         return p;
00727 }