blob: 5435f3d60efc569d29a8663a739963fab4d85dea [file] [log] [blame]
Emeric Vigier2f625822012-08-06 11:09:52 -04001// Copyright (C) 1999-2005 Open Source Telecom Corporation.
2// Copyright (C) 2006-2010 David Sugar, Tycho Softworks.
3//
4// This program is free software; you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation; either version 2 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program; if not, write to the Free Software
16// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17//
18// As a special exception, you may use this file as part of a free software
19// library without restriction. Specifically, if other files instantiate
20// templates or use macros or inline functions from this file, or you compile
21// this file and link it with other files to produce an executable, this
22// file does not by itself cause the resulting executable to be covered by
23// the GNU General Public License. This exception does not however
24// invalidate any other reasons why the executable file might be covered by
25// the GNU General Public License.
26//
27// This exception applies only to the code released under the name GNU
28// Common C++. If you copy code from other releases into a copy of GNU
29// Common C++, as the General Public License permits, the exception does
30// not apply to the code that you add in this way. To avoid misleading
31// anyone as to the status of such modified files, you must delete
32// this exception notice from them.
33//
34// If you write modifications of your own for GNU Common C++, it is your choice
35// whether to permit this exception to apply to your modifications.
36// If you do not wish that, delete this exception notice.
37//
38
39/**
40 * @file tokenizer.h
41 * @short string tokenizer.
42 **/
43
44#ifndef CCXX_TOKENIZER_H_
45#define CCXX_TOKENIZER_H_
46
47#ifndef CCXX_MISSING_H_
48#include <cc++/missing.h>
49#endif
50
51#ifndef CCXX_THREAD_H_
52#include <cc++/thread.h>
53#endif
54
55#ifdef CCXX_NAMESPACES
56namespace ost {
57#endif
58
59/**
60 * Splits delimited string into tokens.
61 *
62 * The StringTokenizer takes a pointer to a string and a pointer
63 * to a string containing a number of possible delimiters.
64 * The StringTokenizer provides an input forward iterator which lets
65 * you iterate through all tokens. An iterator behaves like a logical
66 * pointer to the tokens: to shift to the next token, you increment
67 * the iterator; to get the current token, you dereference the
68 * iterator.
69 *
70 * Memory consumption:
71 * This class operates on the original string and only allocates memory
72 * for the individual tokens actually requested, so this class
73 * allocates at maximum the space required for the longest token in the
74 * given string.
75 * Since for each iteration, memory is reclaimed for the last token,
76 * you MAY NOT store pointers to them; if you need them afterwards,
77 * copy them. You may not modify the original string while you operate
78 * on it with the StringTokenizer; the behaviour is undefined in that
79 * case.
80 *
81 * The iterator has one special method 'nextDelimiter()' which returns
82 * a character containing the next delimiter following this
83 * tokenization process or '\\0', if there are no following delimiters. In
84 * case of skipAllDelim, it returns the FIRST delimiter.
85 *
86 * With the method 'setDelimiters(const char*)' you may change the
87 * set of delimiters. It affects all running iterators.
88 *
89 * Example:
90 * <code><pre>
91 * StringTokenizer st("mary had a little lamb;its fleece was..", " ;");
92 * StringTokenizer::iterator i;
93 * for (i = st.begin() ; i != st.end() ; ++i) {
94 * cout << "Token: '" << *i << "'\t";
95 * cout << " next Delim: '" << i.nextDelimiter() << "'" << endl;
96 * }
97 * </pre></code>
98 *
99 * @author Henner Zeller <H.Zeller@acm.org>
100 * @license LGPL
101 */
102class __EXPORT StringTokenizer {
103public:
104 /**
105 * a delimiter string containing all usual whitespace delimiters.
106 * These are space, tab, newline, carriage return,
107 * formfeed and vertical tab. (see isspace() manpage).
108 */
109 static const char * const SPACE;
110
111 /**
112 * Exception thrown, if someone tried to read beyond the
113 * end of the tokens.
114 * Will not happen if you use it the 'clean' way with comparison
115 * against end(), but if you skip some tokens, because you 'know'
116 * they are there. Simplifies error handling a lot, since you can
117 * just read your tokens the way you expect it, and if there is some
118 * error in the input this Exception will be thrown.
119 */
120 // maybe move more global ?
121 class NoSuchElementException { };
122
123 /**
124 * The input forward iterator for tokens.
125 * @author Henner Zeller
126 */
127 class __EXPORT iterator {
128 friend class StringTokenizer; // access our private constructors
129 private:
130 const StringTokenizer *myTok; // my StringTokenizer
131 const char *start; // start of current token
132 const char *tokEnd; // end of current token (->nxDelimiter)
133 const char *endp; // one before next token
134 char *token; // allocated token, if requested
135
136 // for initialization of the itEnd iterator
137 iterator(const StringTokenizer &tok, const char *end)
138 : myTok(&tok),tokEnd(0),endp(end),token(0) {}
139
140 iterator(const StringTokenizer &tok)
141 : myTok(&tok),tokEnd(0),endp(myTok->str-1),token(0) {
142 ++(*this); // init first token.
143 }
144
145 public:
146 iterator() : myTok(0),start(0),tokEnd(0),endp(0),token(0) {}
147
148 // see also: comment in implementation of operator++
149 virtual ~iterator()
150 { if (token) *token='\0'; delete [] token; }
151
152 /**
153 * copy constructor.
154 */
155 // everything, but not responsible for the allocated token.
156 iterator(const iterator& i) :
157 myTok(i.myTok),start(i.start),tokEnd(i.tokEnd),
158 endp(i.endp),token(0) {}
159
160 /**
161 * assignment operator.
162 */
163 // everything, but not responsible for the allocated token.
164 iterator &operator = (const iterator &i)
165 {
166 myTok = i.myTok;
167 start = i.start; endp = i.endp; tokEnd = i.tokEnd;
168 if ( token )
169 delete [] token;
170 token = 0;
171 return *this;
172 }
173
174 /**
175 * shifts this iterator to the next token in the string.
176 */
177 iterator &operator ++ () THROWS (NoSuchElementException);
178
179 /**
180 * returns the immutable string this iterator
181 * points to or '0' if no token is available (i.e.
182 * i == end()).
183 * Do not store pointers to this token, since it is
184 * invalidated for each iteration. If you need the token,
185 * copy it (e.g. with strdup());
186 */
187 const char* operator * () THROWS (NoSuchElementException);
188
189 /**
190 * returns the next delimiter after the current token or
191 * '\\0', if there are no following delimiters.
192 * It returns the very next delimiter (even if
193 * skipAllDelim=true).
194 */
195 inline char nextDelimiter() const
196 {return (tokEnd) ? *tokEnd : '\0';}
197
198 /**
199 * compares to other iterator. Usually used to
200 * compare against the end() iterator.
201 */
202 // only compare the end-position. speed.
203 inline bool operator == (const iterator &other) const
204 {return (endp == other.endp);}
205
206 /**
207 * compares to other iterator. Usually used to
208 * compare against the end() iterator.
209 */
210 // only compare the end position. speed.
211 inline bool operator != (const iterator &other) const
212 {return (endp != other.endp);}
213 };
214private:
215 friend class StringTokenizer::iterator;
216 const char *str;
217 const char *delim;
218 bool skipAll, trim;
219 iterator itEnd;
220
221public:
222 /**
223 * creates a new StringTokenizer for a string
224 * and a given set of delimiters.
225 *
226 * @param str String to be split up. This string will
227 * not be modified by this StringTokenizer,
228 * but you may as well not modfiy this string
229 * while tokenizing is in process, which may
230 * lead to undefined behaviour.
231 *
232 * @param delim String containing the characters
233 * which should be regarded as delimiters.
234 *
235 * @param skipAllDelim OPTIONAL.
236 * true, if subsequent
237 * delimiters should be skipped at once
238 * or false, if empty tokens should
239 * be returned for two delimiters with
240 * no other text inbetween. The first
241 * behaviour may be desirable for whitespace
242 * skipping, the second for input with
243 * delimited entry e.g. /etc/passwd like files
244 * or CSV input.
245 * NOTE, that 'true' here resembles the
246 * ANSI-C strtok(char *s,char *d) behaviour.
247 * DEFAULT = false
248 *
249 * @param trim OPTIONAL.
250 * true, if the tokens returned
251 * should be trimmed, so that they don't have
252 * any whitespaces at the beginning or end.
253 * Whitespaces are any of the characters
254 * defined in StringTokenizer::SPACE.
255 * If delim itself is StringTokenizer::SPACE,
256 * this will result in a behaviour with
257 * skipAllDelim = true.
258 * DEFAULT = false
259 */
260 StringTokenizer (const char *str,
261 const char *delim,
262 bool skipAllDelim = false,
263 bool trim = false);
264
265 /**
266 * create a new StringTokenizer which splits the input
267 * string at whitespaces. The tokens are stripped from
268 * whitespaces. This means, if you change the set of
269 * delimiters in either the 'begin(const char *delim)' method
270 * or in 'setDelimiters()', you then get whitespace
271 * trimmed tokens, delimited by the new set.
272 * Behaves like StringTokenizer(s, StringTokenizer::SPACE,false,true);
273 */
274 StringTokenizer (const char *s);
275
276 /**
277 * returns the begin iterator
278 */
279 iterator begin() const
280 {return iterator(*this);}
281
282 /**
283 * changes the set of delimiters used in subsequent
284 * iterations.
285 */
286 void setDelimiters (const char *d)
287 {delim = d;}
288
289 /**
290 * returns a begin iterator with an alternate set of
291 * delimiters.
292 */
293 iterator begin(const char *d)
294 {
295 delim = d;
296 return iterator(*this);
297 }
298
299 /**
300 * the iterator marking the end.
301 */
302 const iterator& end() const
303 {return itEnd;}
304};
305
306#ifdef CCXX_NAMESPACES
307}
308#endif
309
310#endif
311
312/** EMACS **
313 * Local variables:
314 * mode: c++
315 * c-basic-offset: 4
316 * End:
317 */