Code:
/**
 * A {@link LineTokenizer} that splits a line into fields on a delimiter
 * <em>string</em>, which may be one or several characters long (for example
 * {@code ","} or {@code "||"}).
 *
 * <p>A delimiter that appears between a pair of quote characters does not
 * split the field. Fields that are entirely enclosed in quote characters have
 * the enclosing quotes removed and doubled quote characters inside them
 * collapsed to a single one.
 */
public class CustomDelimitedLineTokenizer implements LineTokenizer {

    /**
     * Convenient constant for the common case of a tab delimiter.
     */
    public static final char DELIMITER_TAB = '\t';

    /**
     * Convenient constant for the common case of a comma delimiter.
     */
    public static final char DELIMITER_COMMA = ',';

    /**
     * Convenient constant for the common case of a " character used to escape
     * delimiters or line endings.
     */
    public static final char DEFAULT_QUOTE_CHARACTER = '"';

    // The delimiter string used when reading input (one or more characters).
    private String delimiter;

    private char quoteCharacter = DEFAULT_QUOTE_CHARACTER;

    // String form of quoteCharacter, cached by setQuoteCharacter(char).
    private String quoteString;

    // Positions of the fields to keep; null means "keep every field".
    private Collection<Integer> includedFields = null;

    // Stored by setNames(String); not read anywhere else in this class.
    private String names;

    private final FieldSetFactory fieldSetFactory = new DefaultFieldSetFactory();

    /**
     * Create a new instance of the {@link CustomDelimitedLineTokenizer} class for the
     * common case where the delimiter is a {@link #DELIMITER_COMMA comma}.
     *
     * @see #CustomDelimitedLineTokenizer(String)
     * @see #DELIMITER_COMMA
     */
    public CustomDelimitedLineTokenizer() {
        // Without this the delimiter stayed null and tokenize() failed with a
        // NullPointerException despite the documented comma default.
        this(String.valueOf(DELIMITER_COMMA));
    }

    /**
     * Create a new instance that splits on the given delimiter string and uses
     * the {@link #DEFAULT_QUOTE_CHARACTER default quote character}.
     *
     * @param delimiter the delimiter string; may be longer than one character
     */
    public CustomDelimitedLineTokenizer(String delimiter) {
        this.delimiter = delimiter;
        setQuoteCharacter(DEFAULT_QUOTE_CHARACTER);
    }

    /**
     * Setter for the delimiter string.
     *
     * @param delimiter the delimiter string; may be longer than one character
     */
    public void setDelimiter(String delimiter) {
        this.delimiter = delimiter;
    }

    /**
     * The fields to include in the output by position (starting at 0). By
     * default all fields are included, but this property can be set to pick out
     * only a few fields from a larger set.
     *
     * @param includedFields the included fields to set
     */
    public void setIncludedFields(int[] includedFields) {
        // Build the set completely before publishing it to the field.
        Collection<Integer> positions = new HashSet<Integer>();
        for (int position : includedFields) {
            positions.add(position);
        }
        this.includedFields = positions;
    }

    /**
     * Public setter for the quoteCharacter. The quote character can be used to
     * enclose a String which contains the delimiter. Inside a quoted token the
     * quote character can be used to escape itself, thus "a""b""c" is
     * tokenized to a"b"c.
     *
     * @param quoteCharacter the quoteCharacter to set
     *
     * @see #DEFAULT_QUOTE_CHARACTER
     */
    public final void setQuoteCharacter(char quoteCharacter) {
        this.quoteCharacter = quoteCharacter;
        this.quoteString = String.valueOf(quoteCharacter);
    }

    /**
     * If the string is quoted (possibly with whitespace outside the quotes,
     * which is stripped), remove the enclosing quotes and replace escaped
     * quotes inside the string. Quotes are escaped with double instances of
     * the quote character.
     *
     * @param string the raw field text
     * @return the unquoted, unescaped content if the trimmed input is quoted;
     * otherwise the input exactly as given
     */
    private String maybeStripQuotes(String string) {
        String value = string.trim();
        if (!isQuoted(value)) {
            return string;
        }
        if (value.length() < 2) {
            // A lone quote character has no content between quotes.
            return "";
        }
        // Strip the enclosing pair first and only then collapse doubled quotes.
        // Doing it the other way round lets an enclosing quote pair up with an
        // escaped one, so that """" decoded to an empty string instead of ".
        String inner = value.substring(1, value.length() - 1);
        return inner.replace(quoteString + quoteString, quoteString);
    }

    /**
     * Is this string surrounded by quote characters?
     *
     * @param value the string to check
     * @return true if the value starts and ends with the
     * {@link #quoteCharacter}
     */
    private boolean isQuoted(String value) {
        return value.startsWith(quoteString) && value.endsWith(quoteString);
    }

    /**
     * Is the supplied one-character string the quote character?
     *
     * @param s the string to be checked
     * @return <code>true</code> if the supplied string equals the quote character
     * @see #setQuoteCharacter(char)
     */
    protected boolean isQuoteCharacter(String s) {
        return quoteString.equals(s);
    }

    /**
     * Is the supplied string the delimiter string?
     *
     * @param s the string to be checked
     * @return <code>true</code> if the supplied string is the delimiter string
     * @see #setDelimiter(String)
     */
    protected boolean isDelimiterCharacter(String s) {
        return delimiter.equals(s);
    }

    /**
     * Store the field names. The value is only stored; {@link #tokenize(String)}
     * in this class does not read it.
     *
     * @param names the names to store
     */
    public void setNames(String names) {
        this.names = names;
    }

    /**
     * Split the line into fields and wrap the included ones in a {@link FieldSet}.
     *
     * <p>A delimiter splits the line only outside quotes. A line that ends with
     * the delimiter yields a trailing empty field, for single-character and
     * multi-character delimiters alike. An empty line yields no fields.
     *
     * @param line the line to tokenize; <code>null</code> is treated as an empty line
     * @return the field set created from the included tokens
     * @throws IllegalStateException if the delimiter has been set to <code>null</code>
     */
    public FieldSet tokenize(String line) {
        if (delimiter == null) {
            throw new IllegalStateException("A delimiter must be set before a line can be tokenized");
        }
        // This class implements LineTokenizer directly, so nothing upstream
        // guards against a null line.
        String input = (line == null) ? "" : line;

        List<String> tokens = new ArrayList<String>();
        int length = input.length();
        int delimiterLength = delimiter.length();
        boolean inQuoted = false;
        int lastCut = 0;
        int fieldCount = 0;

        for (int i = 0; i < length; i++) {
            boolean isEnd = (i == length - 1);
            boolean isDelimiter = delimiterStartsAt(input, i);

            if ((isDelimiter && !inQuoted) || isEnd) {
                // The field stops just before a delimiter; otherwise (last
                // character of the line) it runs to the end of the line.
                int fieldEnd = isDelimiter ? i : length;
                if (isIncluded(fieldCount)) {
                    tokens.add(maybeStripQuotes(input.substring(lastCut, fieldEnd)));
                }
                fieldCount++;

                if (isDelimiter) {
                    // Move to the last character of the delimiter; the loop
                    // increment then steps past it.
                    i += delimiterLength - 1;
                    if (i == length - 1) {
                        // The delimiter is the last thing on the line, so an
                        // empty field follows it. The earlier check compared a
                        // single character with the whole delimiter and missed
                        // this for multi-character delimiters.
                        if (isIncluded(fieldCount)) {
                            tokens.add("");
                        }
                        fieldCount++;
                    }
                }
                lastCut = i + 1;
            } else if (isQuoteCharacter(String.valueOf(input.charAt(i)))) {
                inQuoted = !inQuoted;
            }
        }

        return fieldSetFactory.create(tokens.toArray(new String[tokens.size()]));
    }

    // True if the field at the given position should appear in the output.
    private boolean isIncluded(int fieldIndex) {
        return includedFields == null || includedFields.contains(fieldIndex);
    }

    // True if a complete delimiter begins at the given index of the line.
    private boolean delimiterStartsAt(String line, int index) {
        int delimiterLength = delimiter.length();
        if (delimiterLength == 0 || index + delimiterLength > line.length()) {
            // An empty delimiter never matches; neither does one that would
            // run past the end of the line.
            return false;
        }
        if (line.charAt(index) != delimiter.charAt(0)) {
            return false;
        }
        return isDelimiterCharacter(line.substring(index, index + delimiterLength));
    }
}