Emeric Vigier | 2f62582 | 2012-08-06 11:09:52 -0400 | [diff] [blame] | 1 | // Copyright (C) 1999-2005 Open Source Telecom Corporation. |
| 2 | // Copyright (C) 2006-2010 David Sugar, Tycho Softworks. |
| 3 | // |
| 4 | // This program is free software; you can redistribute it and/or modify |
| 5 | // it under the terms of the GNU General Public License as published by |
| 6 | // the Free Software Foundation; either version 2 of the License, or |
| 7 | // (at your option) any later version. |
| 8 | // |
| 9 | // This program is distributed in the hope that it will be useful, |
| 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | // GNU General Public License for more details. |
| 13 | // |
| 14 | // You should have received a copy of the GNU General Public License |
| 15 | // along with this program; if not, write to the Free Software |
| 16 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 17 | // |
| 18 | // As a special exception, you may use this file as part of a free software |
| 19 | // library without restriction. Specifically, if other files instantiate |
| 20 | // templates or use macros or inline functions from this file, or you compile |
| 21 | // this file and link it with other files to produce an executable, this |
| 22 | // file does not by itself cause the resulting executable to be covered by |
| 23 | // the GNU General Public License. This exception does not however |
| 24 | // invalidate any other reasons why the executable file might be covered by |
| 25 | // the GNU General Public License. |
| 26 | // |
| 27 | // This exception applies only to the code released under the name GNU |
| 28 | // Common C++. If you copy code from other releases into a copy of GNU |
| 29 | // Common C++, as the General Public License permits, the exception does |
| 30 | // not apply to the code that you add in this way. To avoid misleading |
| 31 | // anyone as to the status of such modified files, you must delete |
| 32 | // this exception notice from them. |
| 33 | // |
| 34 | // If you write modifications of your own for GNU Common C++, it is your choice |
| 35 | // whether to permit this exception to apply to your modifications. |
| 36 | // If you do not wish that, delete this exception notice. |
| 37 | // |
| 38 | |
| 39 | /** |
| 40 | * @file tokenizer.h |
| 41 | * @short string tokenizer. |
| 42 | **/ |
| 43 | |
| 44 | #ifndef CCXX_TOKENIZER_H_ |
| 45 | #define CCXX_TOKENIZER_H_ |
| 46 | |
| 47 | #ifndef CCXX_MISSING_H_ |
| 48 | #include <cc++/missing.h> |
| 49 | #endif |
| 50 | |
| 51 | #ifndef CCXX_THREAD_H_ |
| 52 | #include <cc++/thread.h> |
| 53 | #endif |
| 54 | |
| 55 | #ifdef CCXX_NAMESPACES |
| 56 | namespace ost { |
| 57 | #endif |
| 58 | |
| 59 | /** |
| 60 | * Splits delimited string into tokens. |
| 61 | * |
| 62 | * The StringTokenizer takes a pointer to a string and a pointer |
| 63 | * to a string containing a number of possible delimiters. |
| 64 | * The StringTokenizer provides an input forward iterator which allows |
| 65 | * to iterate through all tokens. An iterator behaves like a logical |
| 66 | * pointer to the tokens, i.e. to shift to the next token, you've |
| 67 | * to increment the iterator, you get the token by dereferencing the |
| 68 | * iterator. |
| 69 | * |
| 70 | * Memory consumption: |
| 71 | * This class operates on the original string and only allocates memory |
| 72 | * for the individual tokens actually requested, so this class |
| 73 | * allocates at maximum the space required for the longest token in the |
| 74 | * given string. |
| 75 | * Since for each iteration, memory is reclaimed for the last token, |
| 76 | * you MAY NOT store pointers to them; if you need them afterwards, |
| 77 | * copy them. You may not modify the original string while you operate |
| 78 | * on it with the StringTokenizer; the behaviour is undefined in that |
| 79 | * case. |
| 80 | * |
| 81 | * The iterator has one special method 'nextDelimiter()' which returns |
| 82 | * a character containing the next delimiter following this |
| 83 | * tokenization process or '\\0', if there are no following delimiters. In |
| 84 | * case of skipAllDelim, it returns the FIRST delimiter. |
| 85 | * |
| 86 | * With the method 'setDelimiters(const char*)' you may change the |
| 87 | * set of delimiters. It affects all running iterators. |
| 88 | * |
| 89 | * Example: |
| 90 | * <code><pre> |
| 91 | * StringTokenizer st("mary had a little lamb;its fleece was..", " ;"); |
| 92 | * StringTokenizer::iterator i; |
| 93 | * for (i = st.begin() ; i != st.end() ; ++i) { |
| 94 | * cout << "Token: '" << *i << "'\t"; |
| 95 | * cout << " next Delim: '" << i.nextDelimiter() << "'" << endl; |
| 96 | * } |
| 97 | * </pre></code> |
| 98 | * |
| 99 | * @author Henner Zeller <H.Zeller@acm.org> |
| 100 | * @license LGPL |
| 101 | */ |
| 102 | class __EXPORT StringTokenizer { |
| 103 | public: |
| 104 | /** |
| 105 | * a delimiter string containing all usual whitespace delimiters. |
| 106 | * These are space, tab, newline, carriage return, |
| 107 | * formfeed and vertical tab. (see isspace() manpage). |
| 108 | */ |
| 109 | static const char * const SPACE; |
| 110 | |
| 111 | /** |
| 112 | * Exception thrown, if someone tried to read beyond the |
| 113 | * end of the tokens. |
| 114 | * Will not happen if you use it the 'clean' way with comparison |
| 115 | * against end(), but if you skip some tokens, because you 'know' |
| 116 | * they are there. Simplifies error handling a lot, since you can |
| 117 | * just read your tokens the way you expect it, and if there is some |
| 118 | * error in the input this Exception will be thrown. |
| 119 | */ |
| 120 | // maybe move more global ? |
| 121 | class NoSuchElementException { }; |
| 122 | |
| 123 | /** |
| 124 | * The input forward iterator for tokens. |
| 125 | * @author Henner Zeller |
| 126 | */ |
| 127 | class __EXPORT iterator { |
| 128 | friend class StringTokenizer; // access our private constructors |
| 129 | private: |
| 130 | const StringTokenizer *myTok; // my StringTokenizer |
| 131 | const char *start; // start of current token |
| 132 | const char *tokEnd; // end of current token (->nxDelimiter) |
| 133 | const char *endp; // one before next token |
| 134 | char *token; // allocated token, if requested |
| 135 | |
| 136 | // for initialization of the itEnd iterator |
| 137 | iterator(const StringTokenizer &tok, const char *end) |
| 138 | : myTok(&tok),tokEnd(0),endp(end),token(0) {} |
| 139 | |
| 140 | iterator(const StringTokenizer &tok) |
| 141 | : myTok(&tok),tokEnd(0),endp(myTok->str-1),token(0) { |
| 142 | ++(*this); // init first token. |
| 143 | } |
| 144 | |
| 145 | public: |
| 146 | iterator() : myTok(0),start(0),tokEnd(0),endp(0),token(0) {} |
| 147 | |
| 148 | // see also: comment in implementation of operator++ |
| 149 | virtual ~iterator() |
| 150 | { if (token) *token='\0'; delete [] token; } |
| 151 | |
| 152 | /** |
| 153 | * copy constructor. |
| 154 | */ |
| 155 | // everything, but not responsible for the allocated token. |
| 156 | iterator(const iterator& i) : |
| 157 | myTok(i.myTok),start(i.start),tokEnd(i.tokEnd), |
| 158 | endp(i.endp),token(0) {} |
| 159 | |
| 160 | /** |
| 161 | * assignment operator. |
| 162 | */ |
| 163 | // everything, but not responsible for the allocated token. |
| 164 | iterator &operator = (const iterator &i) |
| 165 | { |
| 166 | myTok = i.myTok; |
| 167 | start = i.start; endp = i.endp; tokEnd = i.tokEnd; |
| 168 | if ( token ) |
| 169 | delete [] token; |
| 170 | token = 0; |
| 171 | return *this; |
| 172 | } |
| 173 | |
| 174 | /** |
| 175 | * shifts this iterator to the next token in the string. |
| 176 | */ |
| 177 | iterator &operator ++ () THROWS (NoSuchElementException); |
| 178 | |
| 179 | /** |
| 180 | * returns the immutable string this iterator |
| 181 | * points to or '0' if no token is available (i.e. |
| 182 | * i == end()). |
| 183 | * Do not store pointers to this token, since it is |
| 184 | * invalidated for each iteration. If you need the token, |
| 185 | * copy it (e.g. with strdup()); |
| 186 | */ |
| 187 | const char* operator * () THROWS (NoSuchElementException); |
| 188 | |
| 189 | /** |
| 190 | * returns the next delimiter after the current token or |
| 191 | * '\\0', if there are no following delimiters. |
| 192 | * It returns the very next delimiter (even if |
| 193 | * skipAllDelim=true). |
| 194 | */ |
| 195 | inline char nextDelimiter() const |
| 196 | {return (tokEnd) ? *tokEnd : '\0';} |
| 197 | |
| 198 | /** |
| 199 | * compares to other iterator. Usually used to |
| 200 | * compare against the end() iterator. |
| 201 | */ |
| 202 | // only compare the end-position. speed. |
| 203 | inline bool operator == (const iterator &other) const |
| 204 | {return (endp == other.endp);} |
| 205 | |
| 206 | /** |
| 207 | * compares to other iterator. Usually used to |
| 208 | * compare against the end() iterator. |
| 209 | */ |
| 210 | // only compare the end position. speed. |
| 211 | inline bool operator != (const iterator &other) const |
| 212 | {return (endp != other.endp);} |
| 213 | }; |
| 214 | private: |
| 215 | friend class StringTokenizer::iterator; |
| 216 | const char *str; |
| 217 | const char *delim; |
| 218 | bool skipAll, trim; |
| 219 | iterator itEnd; |
| 220 | |
| 221 | public: |
| 222 | /** |
| 223 | * creates a new StringTokenizer for a string |
| 224 | * and a given set of delimiters. |
| 225 | * |
| 226 | * @param str String to be split up. This string will |
| 227 | * not be modified by this StringTokenizer, |
| 228 | * but you may as well not modfiy this string |
| 229 | * while tokenizing is in process, which may |
| 230 | * lead to undefined behaviour. |
| 231 | * |
| 232 | * @param delim String containing the characters |
| 233 | * which should be regarded as delimiters. |
| 234 | * |
| 235 | * @param skipAllDelim OPTIONAL. |
| 236 | * true, if subsequent |
| 237 | * delimiters should be skipped at once |
| 238 | * or false, if empty tokens should |
| 239 | * be returned for two delimiters with |
| 240 | * no other text inbetween. The first |
| 241 | * behaviour may be desirable for whitespace |
| 242 | * skipping, the second for input with |
| 243 | * delimited entry e.g. /etc/passwd like files |
| 244 | * or CSV input. |
| 245 | * NOTE, that 'true' here resembles the |
| 246 | * ANSI-C strtok(char *s,char *d) behaviour. |
| 247 | * DEFAULT = false |
| 248 | * |
| 249 | * @param trim OPTIONAL. |
| 250 | * true, if the tokens returned |
| 251 | * should be trimmed, so that they don't have |
| 252 | * any whitespaces at the beginning or end. |
| 253 | * Whitespaces are any of the characters |
| 254 | * defined in StringTokenizer::SPACE. |
| 255 | * If delim itself is StringTokenizer::SPACE, |
| 256 | * this will result in a behaviour with |
| 257 | * skipAllDelim = true. |
| 258 | * DEFAULT = false |
| 259 | */ |
| 260 | StringTokenizer (const char *str, |
| 261 | const char *delim, |
| 262 | bool skipAllDelim = false, |
| 263 | bool trim = false); |
| 264 | |
| 265 | /** |
| 266 | * create a new StringTokenizer which splits the input |
| 267 | * string at whitespaces. The tokens are stripped from |
| 268 | * whitespaces. This means, if you change the set of |
| 269 | * delimiters in either the 'begin(const char *delim)' method |
| 270 | * or in 'setDelimiters()', you then get whitespace |
| 271 | * trimmed tokens, delimited by the new set. |
| 272 | * Behaves like StringTokenizer(s, StringTokenizer::SPACE,false,true); |
| 273 | */ |
| 274 | StringTokenizer (const char *s); |
| 275 | |
| 276 | /** |
| 277 | * returns the begin iterator |
| 278 | */ |
| 279 | iterator begin() const |
| 280 | {return iterator(*this);} |
| 281 | |
| 282 | /** |
| 283 | * changes the set of delimiters used in subsequent |
| 284 | * iterations. |
| 285 | */ |
| 286 | void setDelimiters (const char *d) |
| 287 | {delim = d;} |
| 288 | |
| 289 | /** |
| 290 | * returns a begin iterator with an alternate set of |
| 291 | * delimiters. |
| 292 | */ |
| 293 | iterator begin(const char *d) |
| 294 | { |
| 295 | delim = d; |
| 296 | return iterator(*this); |
| 297 | } |
| 298 | |
| 299 | /** |
| 300 | * the iterator marking the end. |
| 301 | */ |
| 302 | const iterator& end() const |
| 303 | {return itEnd;} |
| 304 | }; |
| 305 | |
| 306 | #ifdef CCXX_NAMESPACES |
| 307 | } |
| 308 | #endif |
| 309 | |
| 310 | #endif |
| 311 | |
| 312 | /** EMACS ** |
| 313 | * Local variables: |
| 314 | * mode: c++ |
| 315 | * c-basic-offset: 4 |
| 316 | * End: |
| 317 | */ |