// blob: 0f076f89087d86ff0afb4fd48ad1cf0654adaee4 [file] [log] [blame]
// Copyright (C) 1999-2005 Open Source Telecom Corporation.
// Copyright (C) 2006-2010 David Sugar, Tycho Softworks.
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// As a special exception, you may use this file as part of a free software
// library without restriction. Specifically, if other files instantiate
// templates or use macros or inline functions from this file, or you compile
// this file and link it with other files to produce an executable, this
// file does not by itself cause the resulting executable to be covered by
// the GNU General Public License. This exception does not however
// invalidate any other reasons why the executable file might be covered by
// the GNU General Public License.
//
// This exception applies only to the code released under the name GNU
// Common C++. If you copy code from other releases into a copy of GNU
// Common C++, as the General Public License permits, the exception does
// not apply to the code that you add in this way. To avoid misleading
// anyone as to the status of such modified files, you must delete
// this exception notice from them.
//
// If you write modifications of your own for GNU Common C++, it is your choice
// whether to permit this exception to apply to your modifications.
// If you do not wish that, delete this exception notice.
//
/**
* @file tokenizer.h
* @short string tokenizer.
**/
#ifndef COMMONCPP_TOKENIZER_H_
#define COMMONCPP_TOKENIZER_H_
// Project headers. Each include is skipped when the corresponding
// COMMONCPP_*_H_ macro is already defined (presumably the include guard
// that the header itself defines), so an already-seen header is not reopened.
#ifndef COMMONCPP_CONFIG_H_
#include <commoncpp/config.h>
#endif
#ifndef COMMONCPP_THREAD_H_
#include <commoncpp/thread.h>
#endif
// Guard name spelled to follow the same COMMONCPP_*_H_ pattern as above.
#ifndef COMMONCPP_EXCEPTION_H_
#include <commoncpp/exception.h>
#endif
NAMESPACE_COMMONCPP
/**
* Splits delimited string into tokens.
*
* The StringTokenizer takes a pointer to a string and a pointer
* to a string containing a number of possible delimiters.
* The StringTokenizer provides an input forward iterator which allows
* you to iterate through all tokens. An iterator behaves like a logical
* pointer to the tokens, i.e. to shift to the next token, you have
* to increment the iterator; you get the token by dereferencing the
* iterator.
*
* Memory consumption:
* This class operates on the original string and only allocates memory
* for the individual tokens actually requested, so this class
* allocates at maximum the space required for the longest token in the
* given string.
* Since for each iteration, memory is reclaimed for the last token,
* you MAY NOT store pointers to them; if you need them afterwards,
* copy them. You may not modify the original string while you operate
* on it with the StringTokenizer; the behaviour is undefined in that
* case.
*
* The iterator has one special method 'nextDelimiter()' which returns
* a character containing the next delimiter following this
* tokenization process or '\\0', if there are no following delimiters. In
* case of skipAllDelim, it returns the FIRST delimiter.
*
* With the method 'setDelimiters(const char*)' you may change the
* set of delimiters. It affects all running iterators.
*
* Example:
* <code><pre>
* StringTokenizer st("mary had a little lamb;its fleece was..", " ;");
* StringTokenizer::iterator i;
* for (i = st.begin() ; i != st.end() ; ++i) {
* cout << "Token: '" << *i << "'\t";
* cout << " next Delim: '" << i.nextDelimiter() << "'" << endl;
* }
* </pre></code>
*
* @author Henner Zeller <H.Zeller@acm.org>
* @license LGPL
*/
class __EXPORT StringTokenizer {
public:
    /**
     * a delimiter string containing all usual whitespace delimiters.
     * These are space, tab, newline, carriage return,
     * formfeed and vertical tab. (see isspace() manpage).
     */
    static const char * const SPACE;

    /**
     * Exception thrown, if someone tried to read beyond the
     * end of the tokens.
     * Will not happen if you use it the 'clean' way with comparison
     * against end(), but if you skip some tokens, because you 'know'
     * they are there. Simplifies error handling a lot, since you can
     * just read your tokens the way you expect it, and if there is some
     * error in the input this Exception will be thrown.
     */
    // maybe move more global ?
    class NoSuchElementException { };

    /**
     * The input forward iterator for tokens.
     *
     * Copies of an iterator share the position in the string but never
     * the allocated token buffer: each iterator owns (and frees) only the
     * buffer it allocated itself.
     *
     * @author Henner Zeller
     */
    class __EXPORT iterator {
        friend class StringTokenizer; // access our private constructors

    private:
        const StringTokenizer *myTok;   // my StringTokenizer
        const char *start;              // start of current token
        const char *tokEnd;             // end of current token (->nxDelimiter)
        const char *endp;               // one before next token
        char *token;                    // allocated token, if requested

        // for initialization of the itEnd iterator.
        // 'start' is null-initialised so that copying or assigning from
        // this iterator never reads an indeterminate pointer value.
        iterator(const StringTokenizer &tok, const char *end)
            : myTok(&tok),start(0),tokEnd(0),endp(end),token(0) {}

        // begin iterator: positions endp one before the string and then
        // advances once. 'start' is null-initialised up front; operator++
        // (defined out of line) is presumably what sets it for real.
        iterator(const StringTokenizer &tok)
            : myTok(&tok),start(0),tokEnd(0),endp(myTok->str-1),token(0) {
            ++(*this); // init first token.
        }

    public:
        /**
         * creates a detached iterator that belongs to no tokenizer;
         * all positions are null.
         */
        iterator() : myTok(0),start(0),tokEnd(0),endp(0),token(0) {}

        /**
         * clears and releases the token buffer, if one was allocated.
         */
        // see also: comment in implementation of operator++
        virtual ~iterator()
        {
            if (token)
                *token = '\0';
            delete [] token;
        }

        /**
         * copy constructor.
         */
        // everything, but not responsible for the allocated token.
        iterator(const iterator& i) :
            myTok(i.myTok),start(i.start),tokEnd(i.tokEnd),
            endp(i.endp),token(0) {}

        /**
         * assignment operator.
         */
        // everything, but not responsible for the allocated token.
        iterator &operator = (const iterator &i)
        {
            myTok = i.myTok;
            start = i.start;
            endp = i.endp;
            tokEnd = i.tokEnd;
            // our own buffer describes the previous position; drop it.
            // delete [] on a null pointer is a no-op, so no check needed.
            delete [] token;
            token = 0;
            return *this;
        }

        /**
         * shifts this iterator to the next token in the string.
         */
        iterator &operator ++ () THROWS (NoSuchElementException);

        /**
         * returns the immutable string this iterator
         * points to or '0' if no token is available (i.e.
         * i == end()).
         * Do not store pointers to this token, since it is
         * invalidated for each iteration. If you need the token,
         * copy it (e.g. with strdup());
         */
        const char* operator * () THROWS (NoSuchElementException);

        /**
         * returns the next delimiter after the current token or
         * '\\0', if there are no following delimiters.
         * It returns the very next delimiter (even if
         * skipAllDelim=true).
         */
        inline char nextDelimiter() const
        {
            return (tokEnd) ? *tokEnd : '\0';
        }

        /**
         * compares to other iterator. Usually used to
         * compare against the end() iterator.
         */
        // only compare the end-position. speed.
        inline bool operator == (const iterator &other) const
        {
            return (endp == other.endp);
        }

        /**
         * compares to other iterator. Usually used to
         * compare against the end() iterator.
         */
        // only compare the end position. speed.
        inline bool operator != (const iterator &other) const
        {
            return (endp != other.endp);
        }
    };

private:
    friend class StringTokenizer::iterator;

    const char *str;        // string being tokenized (not owned, not modified)
    const char *delim;      // current set of delimiter characters
    bool skipAll, trim;     // see the constructor's skipAllDelim / trim
    iterator itEnd;         // the iterator handed out by end()

public:
    /**
     * creates a new StringTokenizer for a string
     * and a given set of delimiters.
     *
     * @param str String to be split up. This string will
     * not be modified by this StringTokenizer,
     * but you may as well not modify this string
     * while tokenizing is in process, which may
     * lead to undefined behaviour.
     *
     * @param delim String containing the characters
     * which should be regarded as delimiters.
     *
     * @param skipAllDelim OPTIONAL.
     * true, if subsequent
     * delimiters should be skipped at once
     * or false, if empty tokens should
     * be returned for two delimiters with
     * no other text in between. The first
     * behaviour may be desirable for whitespace
     * skipping, the second for input with
     * delimited entry e.g. /etc/passwd like files
     * or CSV input.
     * NOTE, that 'true' here resembles the
     * ANSI-C strtok(char *s,char *d) behaviour.
     * DEFAULT = false
     *
     * @param trim OPTIONAL.
     * true, if the tokens returned
     * should be trimmed, so that they don't have
     * any whitespaces at the beginning or end.
     * Whitespaces are any of the characters
     * defined in StringTokenizer::SPACE.
     * If delim itself is StringTokenizer::SPACE,
     * this will result in a behaviour with
     * skipAllDelim = true.
     * DEFAULT = false
     */
    StringTokenizer (const char *str,
                     const char *delim,
                     bool skipAllDelim = false,
                     bool trim = false);

    /**
     * create a new StringTokenizer which splits the input
     * string at whitespaces. The tokens are stripped from
     * whitespaces. This means, if you change the set of
     * delimiters in either the 'begin(const char *delim)' method
     * or in 'setDelimiters()', you then get whitespace
     * trimmed tokens, delimited by the new set.
     * Behaves like StringTokenizer(s, StringTokenizer::SPACE,false,true);
     */
    StringTokenizer (const char *s);

    /**
     * returns the begin iterator
     */
    iterator begin() const
    {
        return iterator(*this);
    }

    /**
     * changes the set of delimiters used in subsequent
     * iterations.
     */
    void setDelimiters (const char *d)
    {
        delim = d;
    }

    /**
     * returns a begin iterator with an alternate set of
     * delimiters. The new set replaces the tokenizer's
     * delimiters, exactly as setDelimiters() would.
     */
    iterator begin(const char *d)
    {
        delim = d;
        return iterator(*this);
    }

    /**
     * the iterator marking the end.
     */
    const iterator& end() const
    {
        return itEnd;
    }
};
END_NAMESPACE
#endif
/** EMACS **
* Local variables:
* mode: c++
* c-basic-offset: 4
* End:
*/