Alexandre Lision | ddd731e | 2014-01-31 11:50:08 -0500 | [diff] [blame^] | 1 | // Copyright (C) 1999-2005 Open Source Telecom Corporation. |
| 2 | // Copyright (C) 2006-2010 David Sugar, Tycho Softworks. |
| 3 | // |
| 4 | // This program is free software; you can redistribute it and/or modify |
| 5 | // it under the terms of the GNU General Public License as published by |
| 6 | // the Free Software Foundation; either version 2 of the License, or |
| 7 | // (at your option) any later version. |
| 8 | // |
| 9 | // This program is distributed in the hope that it will be useful, |
| 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | // GNU General Public License for more details. |
| 13 | // |
| 14 | // You should have received a copy of the GNU General Public License |
| 15 | // along with this program; if not, write to the Free Software |
| 16 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 17 | // |
| 18 | // As a special exception, you may use this file as part of a free software |
| 19 | // library without restriction. Specifically, if other files instantiate |
| 20 | // templates or use macros or inline functions from this file, or you compile |
| 21 | // this file and link it with other files to produce an executable, this |
| 22 | // file does not by itself cause the resulting executable to be covered by |
| 23 | // the GNU General Public License. This exception does not however |
| 24 | // invalidate any other reasons why the executable file might be covered by |
| 25 | // the GNU General Public License. |
| 26 | // |
| 27 | // This exception applies only to the code released under the name GNU |
| 28 | // Common C++. If you copy code from other releases into a copy of GNU |
| 29 | // Common C++, as the General Public License permits, the exception does |
| 30 | // not apply to the code that you add in this way. To avoid misleading |
| 31 | // anyone as to the status of such modified files, you must delete |
| 32 | // this exception notice from them. |
| 33 | // |
| 34 | // If you write modifications of your own for GNU Common C++, it is your choice |
| 35 | // whether to permit this exception to apply to your modifications. |
| 36 | // If you do not wish that, delete this exception notice. |
| 37 | // |
| 38 | |
| 39 | /** |
| 40 | * @file tokenizer.h |
| 41 | * @short string tokenizer. |
| 42 | **/ |
| 43 | |
| 44 | #ifndef COMMONCPP_TOKENIZER_H_ |
| 45 | #define COMMONCPP_TOKENIZER_H_ |
| 46 | |
| 47 | #ifndef COMMONCPP_CONFIG_H_ |
| 48 | #include <commoncpp/config.h> |
| 49 | #endif |
| 50 | |
| 51 | #ifndef COMMONCPP_THREAD_H_ |
| 52 | #include <commoncpp/thread.h> |
| 53 | #endif |
| 54 | |
| 55 | #ifndef COMMMONCPP_EXCEPTION_H_ |
| 56 | #include <commoncpp/exception.h> |
| 57 | #endif |
| 58 | |
| 59 | NAMESPACE_COMMONCPP |
| 60 | |
/**
 * Splits delimited string into tokens.
 *
 * The StringTokenizer takes a pointer to a string and a pointer
 * to a string containing a number of possible delimiters.
 * The StringTokenizer provides an input forward iterator which lets
 * you iterate through all tokens. An iterator behaves like a logical
 * pointer to the tokens, i.e. to shift to the next token, you
 * increment the iterator, and you get the token by dereferencing the
 * iterator.
 *
 * Memory consumption:
 * This class operates on the original string and only allocates memory
 * for the individual tokens actually requested, so this class
 * allocates at maximum the space required for the longest token in the
 * given string.
 * Since for each iteration, memory is reclaimed for the last token,
 * you MAY NOT store pointers to them; if you need them afterwards,
 * copy them. You may not modify the original string while you operate
 * on it with the StringTokenizer; the behaviour is undefined in that
 * case.
 *
 * The iterator has one special method 'nextDelimiter()' which returns
 * a character containing the next delimiter following this
 * tokenization process or '\\0', if there are no following delimiters. In
 * case of skipAllDelim, it returns the FIRST delimiter.
 *
 * With the method 'setDelimiters(const char*)' you may change the
 * set of delimiters. It affects all running iterators.
 *
 * Example:
 * <code><pre>
 *  StringTokenizer st("mary had a little lamb;its fleece was..", " ;");
 *  StringTokenizer::iterator i;
 *  for (i = st.begin() ; i != st.end() ; ++i) {
 *        cout << "Token: '" << *i << "'\t";
 *        cout << " next Delim: '" << i.nextDelimiter() << "'" << endl;
 *  }
 *  </pre></code>
 *
 * @author Henner Zeller <H.Zeller@acm.org>
 * @license LGPL
 */
class __EXPORT StringTokenizer {
public:
    /**
     * a delimiter string containing all usual whitespace delimiters.
     * These are space, tab, newline, carriage return,
     * formfeed and vertical tab. (see isspace() manpage).
     */
    static const char * const SPACE;

    /**
     * Exception thrown, if someone tried to read beyond the
     * end of the tokens.
     * Will not happen if you use it the 'clean' way with comparison
     * against end(), but if you skip some tokens, because you 'know'
     * they are there. Simplifies error handling a lot, since you can
     * just read your tokens the way you expect it, and if there is some
     * error in the input this Exception will be thrown.
     */
    // maybe move more global ?
    class NoSuchElementException { };

    /**
     * The input forward iterator for tokens.
     * @author Henner Zeller
     */
    class __EXPORT iterator {
        friend class StringTokenizer;  // access our private constructors
    private:
        const StringTokenizer *myTok; // my StringTokenizer
        const char *start;      // start of current token
        const char *tokEnd;     // end of current token (->nxDelimiter)
        const char *endp;       // one before next token
        char *token;            // allocated token, if requested

        // for initialization of the itEnd iterator.
        // 'start' is explicitly nulled: the copy constructor and the
        // assignment operator read it, so it must never be indeterminate.
        iterator(const StringTokenizer &tok, const char *end)
            : myTok(&tok),start(0),tokEnd(0),endp(end),token(0) {}

        // for initialization of a begin iterator: positions endp one
        // before the first character of the tokenizer's string and then
        // advances once. 'tok.str' is used rather than 'myTok->str' so
        // the initializer does not depend on member declaration order.
        iterator(const StringTokenizer &tok)
            : myTok(&tok),start(0),tokEnd(0),endp(tok.str-1),token(0) {
            ++(*this); // init first token.
        }

    public:
        /**
         * creates a detached iterator; all pointers are null.
         */
        iterator() : myTok(0),start(0),tokEnd(0),endp(0),token(0) {}

        // see also: comment in implementation of operator++
        // (the first byte of an allocated token is cleared before the
        // buffer is released).
        virtual ~iterator()
            { if (token) *token='\0'; delete [] token; }

        /**
         * copy constructor.
         */
        // copies everything, but is not responsible for the allocated
        // token of the source; the copy starts without a token buffer.
        iterator(const iterator& i) :
            myTok(i.myTok),start(i.start),tokEnd(i.tokEnd),
            endp(i.endp),token(0) {}

        /**
         * assignment operator.
         */
        // copies everything, but is not responsible for the allocated
        // token of the source; our own token buffer is released.
        iterator &operator = (const iterator &i)
        {
            myTok = i.myTok;
            start = i.start;
            tokEnd = i.tokEnd;
            endp = i.endp;
            delete [] token;    // deleting a null pointer is a no-op
            token = 0;
            return *this;
        }

        /**
         * shifts this iterator to the next token in the string.
         */
        iterator &operator ++ () THROWS (NoSuchElementException);

        /**
         * returns the immutable string this iterator
         * points to or '0' if no token is available (i.e.
         * i == end()).
         * Do not store pointers to this token, since it is
         * invalidated for each iteration. If you need the token,
         * copy it (e.g. with strdup());
         */
        const char* operator * () THROWS (NoSuchElementException);

        /**
         * returns the next delimiter after the current token or
         * '\\0', if there are no following delimiters.
         * It returns the very next delimiter (even if
         * skipAllDelim=true).
         */
        inline char nextDelimiter() const
            {return (tokEnd) ? *tokEnd : '\0';}

        /**
         * compares to other iterator. Usually used to
         * compare against the end() iterator.
         */
        // only compare the end-position. speed.
        inline bool operator == (const iterator &other) const
            {return (endp == other.endp);}

        /**
         * compares to other iterator. Usually used to
         * compare against the end() iterator.
         */
        // only compare the end position. speed.
        inline bool operator != (const iterator &other) const
            {return (endp != other.endp);}
    };
private:
    friend class StringTokenizer::iterator;
    const char *str;        // string being tokenized (not owned here)
    const char *delim;      // current set of delimiter characters
    bool skipAll, trim;     // options; see the constructor documentation
    iterator itEnd;         // the iterator handed out by end()

public:
    /**
     * creates a new StringTokenizer for a string
     * and a given set of delimiters.
     *
     * @param  str          String to be split up. This string will
     *                      not be modified by this StringTokenizer,
     *                      but you may as well not modify this string
     *                      while tokenizing is in process, which may
     *                      lead to undefined behaviour.
     *
     * @param  delim        String containing the characters
     *                      which should be regarded as delimiters.
     *
     * @param  skipAllDelim OPTIONAL.
     *                      true, if subsequent
     *                      delimiters should be skipped at once
     *                      or false, if empty tokens should
     *                      be returned for two delimiters with
     *                      no other text in between. The first
     *                      behaviour may be desirable for whitespace
     *                      skipping, the second for input with
     *                      delimited entry e.g. /etc/passwd like files
     *                      or CSV input.
     *                      NOTE, that 'true' here resembles the
     *                      ANSI-C strtok(char *s,char *d) behaviour.
     *                      DEFAULT = false
     *
     * @param trim          OPTIONAL.
     *                      true, if the tokens returned
     *                      should be trimmed, so that they don't have
     *                      any whitespaces at the beginning or end.
     *                      Whitespaces are any of the characters
     *                      defined in StringTokenizer::SPACE.
     *                      If delim itself is StringTokenizer::SPACE,
     *                      this will result in a behaviour with
     *                      skipAllDelim = true.
     *                      DEFAULT = false
     */
    StringTokenizer (const char *str,
             const char *delim,
             bool skipAllDelim = false,
             bool trim = false);

    /**
     * create a new StringTokenizer which splits the input
     * string at whitespaces. The tokens are stripped from
     * whitespaces. This means, if you change the set of
     * delimiters in either the 'begin(const char *delim)' method
     * or in 'setDelimiters()', you then get whitespace
     * trimmed tokens, delimited by the new set.
     * Behaves like StringTokenizer(s, StringTokenizer::SPACE,false,true);
     */
    StringTokenizer (const char *s);

    /**
     * returns the begin iterator
     */
    iterator begin() const
        {return iterator(*this);}

    /**
     * changes the set of delimiters used in subsequent
     * iterations.
     */
    void setDelimiters (const char *d)
        {delim = d;}

    /**
     * returns a begin iterator with an alternate set of
     * delimiters. The new set replaces the tokenizer's current
     * delimiters, exactly as setDelimiters() does.
     */
    iterator begin(const char *d)
    {
        delim = d;
        return iterator(*this);
    }

    /**
     * the iterator marking the end.
     */
    const iterator& end() const
        {return itEnd;}
};
| 307 | |
| 308 | END_NAMESPACE |
| 309 | |
| 310 | #endif |
| 311 | |
| 312 | /** EMACS ** |
| 313 | * Local variables: |
| 314 | * mode: c++ |
| 315 | * c-basic-offset: 4 |
| 316 | * End: |
| 317 | */ |