// CsvParser.java

// Generated by delombok at Mon Nov 18 07:27:48 UTC 2024
package de.larssh.utils.text;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import de.larssh.utils.annotations.PackagePrivate;
import de.larssh.utils.io.PeekableReader;

/**
 * This class holds all information required to parse a CSV data stream and
 * encapsulates the parsing algorithm using {@link #parse(Reader)}.
 */
@PackagePrivate
class CsvParser {
	/**
	 * Asserts the validity of {@code separator} and {@code escaper} as CSV control
	 * characters.
	 *
	 * <p>
	 * The two characters must differ from each other and neither of them may be a
	 * carriage return ({@code \r}) or line feed ({@code \n}) character.
	 *
	 * @param separator the separator character
	 * @param escaper   the escaping character
	 * @throws IllegalArgumentException on illegal {@code separator} or
	 *                                  {@code escaper} value
	 */
	@PackagePrivate
	@SuppressWarnings({"PMD.AvoidLiteralsInIfCondition", "PMD.CyclomaticComplexity"})
	static void assertCsvInput(final char separator, final char escaper) {
		if (escaper == separator) {
			throw new IllegalArgumentException("The escape and separator characters must not be equal.");
		}
		// The messages below spell the offending character as a visible escape
		// sequence. Embedding the raw control character would garble the message
		// when it is printed to a terminal or written to a log file.
		if (escaper == '\r') {
			throw new IllegalArgumentException("The escape character must not be '\\r'.");
		}
		if (escaper == '\n') {
			throw new IllegalArgumentException("The escape character must not be '\\n'.");
		}
		if (separator == '\r') {
			throw new IllegalArgumentException("The separator character must not be '\\r'.");
		}
		if (separator == '\n') {
			throw new IllegalArgumentException("The separator character must not be '\\n'.");
		}
	}

	/**
	 * Tries to read a new line from {@code reader}. This method accepts either of
	 * {@code \n}, {@code \r\n} or {@code \r} as new line sequence.
	 *
	 * <p>
	 * Nothing is consumed if the reader is exhausted or the next character does
	 * not start a new line sequence.
	 *
	 * @param reader a {@link PeekableReader} as data input
	 * @return {@code true} if a new line was read, else {@code false}
	 * @throws IOException if an I/O error occurs
	 */
	@PackagePrivate
	static boolean readNewLine(final PeekableReader reader) throws IOException {
		if (!reader.hasNext()) {
			return false;
		}
		final char character = reader.peek();
		if (character != '\r' && character != '\n') {
			return false;
		}
		reader.next();
		// Treat "\r\n" as one single new line sequence
		if (character == '\r' && reader.hasNext() && reader.peek() == '\n') {
			reader.next();
		}
		return true;
	}

	/**
	 * The CSV separator character
	 */
	private final char separator;
	/**
	 * The CSV escaping character
	 */
	private final char escaper;

	/**
	 * Checks if {@code character} is either the CSV separator character or a new
	 * line character.
	 *
	 * @param character the character to check
	 * @return {@code true} if {@code character} is either the separator character
	 *         or a new line character, else {@code false}
	 */
	@PackagePrivate
	boolean isSeparatorOrNewLine(final char character) {
		return character == separator || character == '\r' || character == '\n';
	}

	/**
	 * Parses the CSV data given by {@code reader}, starting at the current
	 * position.
	 *
	 * <p>
	 * The given reader is wrapped into a {@link PeekableReader}, which is closed
	 * once parsing finished or failed.
	 *
	 * @param reader a {@link Reader} as CSV data input
	 * @return an object representing the parsed CSV data
	 * @throws IllegalArgumentException on illegal {@code separator} or
	 *                                  {@code escaper} value
	 * @throws IOException              if an I/O error occurs
	 */
	public Csv parse(final Reader reader) throws IOException {
		try (PeekableReader peekableReader = new PeekableReader(reader)) {
			return parse(peekableReader);
		}
	}

	/**
	 * Parses the CSV data given by {@code reader}, starting at the current
	 * position.
	 *
	 * <p>
	 * An empty input results in a CSV object without any row. A single trailing
	 * new line sequence does not create an additional empty row.
	 *
	 * @param reader a {@link PeekableReader} as CSV data input
	 * @return an object representing the parsed CSV data
	 * @throws IllegalArgumentException on illegal {@code separator} or
	 *                                  {@code escaper} value
	 * @throws IOException              if an I/O error occurs
	 */
	@SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
	private Csv parse(final PeekableReader reader) throws IOException {
		// The control characters are validated here instead of inside the
		// constructor, therefore invalid values are reported when parsing starts.
		assertCsvInput(separator, escaper);
		final Csv csv = new Csv();
		// Make sure, an empty input results in an empty result with no row
		if (!reader.hasNext()) {
			return csv.unmodifiable();
		}
		List<String> currentRow = new CsvRow(csv, csv.size(), new ArrayList<>());
		csv.add(currentRow);
		while (reader.hasNext()) {
			if (readNewLine(reader)) {
				// Ignore a trailing new line
				if (!reader.hasNext()) {
					return csv.unmodifiable();
				}
				// Add new row
				currentRow = new CsvRow(csv, csv.size(), new ArrayList<>());
				csv.add(currentRow);
			} else {
				// Parse the next value and add it to the current row
				currentRow.add(parseValue(reader));
				// In case of a separator, continue with the next value of the current row
				while (reader.hasNext() && reader.peek() == separator) {
					reader.next();
					currentRow.add(parseValue(reader));
				}
			}
		}
		return csv.unmodifiable();
	}

	/**
	 * Parses a single CSV value, starting at the current position and stopping
	 * right behind its last character. The separator or new line character that
	 * terminates the value is not consumed.
	 *
	 * <p>
	 * In case an escaped CSV value comes with leading or trailing spaces, they are
	 * stripped off of the actual value.
	 *
	 * @param reader a {@link PeekableReader} as data input
	 * @return the parsed CSV value
	 * @throws IOException if an I/O error occurs
	 */
	@PackagePrivate
	@SuppressWarnings("PMD.PrematureDeclaration")
	String parseValue(final PeekableReader reader) throws IOException {
		final StringBuilder builder = new StringBuilder();
		// Check if the given value is escaped. Leading whitespaces need to be read to
		// determine that.
		final boolean isEscaped = readLeadingWhitespacesAndIsEscaped(reader, builder);
		while (reader.hasNext()) {
			// Check if control characters, such as the separator or new line characters,
			// need to be handled in their special way. If the given value is escaped, the
			// escape character needs to precede such characters to take effect.
			if (readEscaperAndIsControlCharHandling(reader, isEscaped)) {
				// If the given value is escaped and the escaping character is followed by
				// whitespaces only, we trim the trailing whitespaces.
				final String trailingWhitespaces = isEscaped ? readWhitespaces(reader) : "";
				// In case of a separator or new line character the current value was read
				// successfully
				if (!reader.hasNext() || isSeparatorOrNewLine(reader.peek())) {
					return builder.toString();
				}
				if (isEscaped) {
					// Else we simply append the escaping character and further unexpected
					// characters.
					builder.append(escaper).append(trailingWhitespaces);
				}
			}
			// Append the current character. For a doubled escaping character this is
			// the second one, as the first one was consumed by the check above.
			builder.append(reader.next());
		}
		return builder.toString();
	}

	/**
	 * Checks if control characters need to be handled.
	 *
	 * <p>
	 * For non-escaped values control characters always need to be handled. For
	 * escaped values the value needs to be "closed" using the escaping character
	 * prior handling following control characters.
	 *
	 * <p>
	 * For escaped values a single escaping character at the reader's position is
	 * consumed. If it is directly followed by another escaping character, the
	 * second one is left unread and {@code false} is returned.
	 *
	 * @param reader    a {@link PeekableReader} as data input
	 * @param isEscaped {@code true} if the current value is escaped, else
	 *                  {@code false}
	 * @return {@code true} if control characters need to be handled, else
	 *         {@code false}
	 * @throws IOException if an I/O error occurs
	 */
	@PackagePrivate
	boolean readEscaperAndIsControlCharHandling(final PeekableReader reader, final boolean isEscaped) throws IOException {
		if (!isEscaped) {
			return true;
		}
		// Inside an escaped value only the escaping character is of interest
		if (!reader.hasNext() || reader.peek() != escaper) {
			return false;
		}
		reader.next();
		// The value is closed unless another escaping character follows directly
		return !reader.hasNext() || reader.peek() != escaper;
	}

	/**
	 * Checks if the value, that starts at the reader's position, is escaped.
	 * Leading whitespaces need to be read to determine that.
	 *
	 * <p>
	 * The reader is forwarded to the first character of the value. In case of an
	 * escaped value, the escaping character is read.
	 *
	 * <p>
	 * Leading whitespaces are trimmed in case of an escaped value, else they are
	 * appended to {@code builder}.
	 *
	 * @param reader  a {@link PeekableReader} as data input
	 * @param builder a builder to possibly append whitespaces to.
	 * @return {@code true} if the value, that starts at the reader's position, is
	 *         escaped, else {@code false}
	 * @throws IOException if an I/O error occurs
	 */
	@PackagePrivate
	@SuppressWarnings("PMD.PrematureDeclaration")
	boolean readLeadingWhitespacesAndIsEscaped(final PeekableReader reader, final StringBuilder builder) throws IOException {
		final String whitespaces = readWhitespaces(reader);
		if (reader.hasNext() && reader.peek() == escaper) {
			// Consume the opening escaping character and drop the leading whitespaces
			reader.next();
			return true;
		}
		// Leading whitespaces are part of a non-escaped value
		builder.append(whitespaces);
		return false;
	}

	/**
	 * Reads whitespace characters without a special meaning starting at the current
	 * position of the reader.
	 *
	 * <p>
	 * Whitespace characters are defined using
	 * {@link Characters#isAsciiWhitespace(char)}. Separator, escaping and new line
	 * characters are not handled as whitespace character, even if they could be a
	 * whitespace character.
	 *
	 * @param reader a {@link PeekableReader} as data input
	 * @return the read whitespace characters
	 * @throws IOException if an I/O error occurs
	 */
	@PackagePrivate
	String readWhitespaces(final PeekableReader reader) throws IOException {
		final StringBuilder builder = new StringBuilder();
		while (reader.hasNext()) {
			final char character = reader.peek();
			// Stop reading when finding the first non-whitespace character
			if (character == escaper || !Characters.isAsciiWhitespace(character) || isSeparatorOrNewLine(character)) {
				return builder.toString();
			}
			builder.append(reader.next());
		}
		return builder.toString();
	}

	/**
	 * Returns a textual representation of this parser consisting of its separator
	 * and escaping characters.
	 *
	 * @return a textual representation of this parser
	 */
	@edu.umd.cs.findbugs.annotations.NonNull
	@java.lang.Override
	@java.lang.SuppressWarnings("all")
	@edu.umd.cs.findbugs.annotations.SuppressFBWarnings(justification = "generated code")
	@lombok.Generated
	public java.lang.String toString() {
		return "CsvParser(separator=" + this.separator + ", escaper=" + this.escaper + ")";
	}

	/**
	 * Creates a new {@code CsvParser} instance.
	 *
	 * <p>
	 * The given characters are stored as-is and are not validated by this
	 * constructor.
	 *
	 * @param separator The CSV separator character
	 * @param escaper The CSV escaping character
	 */
	@java.lang.SuppressWarnings("all")
	@edu.umd.cs.findbugs.annotations.SuppressFBWarnings(justification = "generated code")
	@lombok.Generated
	public CsvParser(final char separator, final char escaper) {
		this.separator = separator;
		this.escaper = escaper;
	}
}