port of Israel Ekpo's CSV parser library

Dependents:   parser_sample IoTGateway_Basic

csv_parser.h

Committer:
hlipka
Date:
2011-01-24
Revision:
0:7c9aa931c67c

File content as of revision 0:7c9aa931c67c:

/*

Copyright (c) 2008 - 2009, Israel Ekpo
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of Israel Ekpo nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


*/
/**
 * csv_parser Header File
 *
 * This object is used to parse text documents that are delimited by some
 * type of character. Some of the common ones use spaces, tabs, commas and semi-colons.
 *
 * This is a list of common characters encountered by this program
 *
 * This list was prepared from the data from http://www.asciitable.com
 *
 * @li DEC is how it would be represented in decimal form (base 10)
 * @li HEX is how it would be represented in hexadecimal format (base 16)
 *
 * @li    DEC    HEX        Character Name
 * @li    0    0x00    null
 * @li    9    0x09    horizontal tab
 * @li    10    0x0A    line feed, new line
 * @li    13    0x0D    carriage return
 * @li    27    0x1B    escape
 * @li    32    0x20    space
 * @li    34    0x22    double quote
 * @li    39    0x27    single quote
 * @li    44    0x2C    comma
 * @li    92    0x5C    backslash
 *
 * @author Israel Ekpo <israel.ekpo@israelekpo.com>
 */

#ifndef CSV_PARSER_HPP_INCLUDED

#define CSV_PARSER_HPP_INCLUDED

/* Library version components; together they read as version 1.0.0. */
#define LIBCSV_PARSER_MAJOR_VERSION 1

#define LIBCSV_PARSER_MINOR_VERSION 0

#define LIBCSV_PARSER_PATCH_VERSION 0

/*
 * The same version expressed as a single integer, convenient for
 * preprocessor comparisons.
 *
 * NOTE(review): 10000 for version 1.0.0 suggests the encoding
 * major * 10000 + minor * 100 + patch -- confirm before bumping any component.
 */
#define LIBCSV_PARSER_VERSION_NUMBER 10000

/* C++ header files */
#include <string>
#include <vector>

/* C header files */
#include <cstdio>
#include <cstring>
#include <cstdlib>

using namespace std;

/**
 * @typedef csv_row
 *
 * One parsed record: an ordered sequence of field values.
 *
 * Alias for std::vector<std::string>.
 */
typedef std::vector<std::string> csv_row;

/**
 * @typedef csv_row_ptr
 *
 * Pointer to a csv_row, used by the field-extraction helpers to fill
 * a caller-supplied row.
 *
 * Alias for std::vector<std::string> *.
 */
typedef csv_row *csv_row_ptr;

/**
 * @typedef enclosure_type_t
 *
 * Selects how field enclosure characters are treated while a CSV file is parsed.
 *
 * @li ENCLOSURE_NONE         (1) fields are never wrapped in an enclosure character
 * @li ENCLOSURE_REQUIRED     (2) every field must be wrapped in the enclosure character
 * @li ENCLOSURE_OPTIONAL     (3) a field may or may not be wrapped in the enclosure character
 *
 * ENCLOSURE_TYPE_BEGIN (0) and ENCLOSURE_TYPE_END (4) only bracket the valid
 * range and are never to be used as a parsing mode.
 */
typedef enum
{
    ENCLOSURE_TYPE_BEGIN = 0,   /* lower sentinel, not a valid mode */
    ENCLOSURE_NONE,             /* 1 */
    ENCLOSURE_REQUIRED,         /* 2 */
    ENCLOSURE_OPTIONAL,         /* 3 */
    ENCLOSURE_TYPE_END          /* 4, upper sentinel, not a valid mode */
} enclosure_type_t;

/**
 * @def CSV_PARSER_FREE_BUFFER_PTR(ptr)
 *
 * Used to deallocate buffer pointers
 *
 * Releases the buffer with free() and then resets the pointer to NULL so it
 * cannot be freed or dereferenced again by accident. Passing a pointer that
 * is already NULL is harmless because free(NULL) is a no-op.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement: it must be followed by a semicolon and is safe to use as the
 * unbraced body of an if / else.
 *
 * @note ptr is evaluated twice and must be a modifiable lvalue.
 */
#define CSV_PARSER_FREE_BUFFER_PTR(ptr)     \
do                                          \
{                                           \
    free(ptr);                              \
                                            \
    (ptr) = NULL;                           \
} while (0)

/**
 * @def CSV_PARSER_FREE_FILE_PTR(fptr)
 *
 * Used to close open file handles
 *
 * Closes the stream only if the pointer is not NULL (fclose(NULL) is
 * undefined, so the check is required) and then resets the pointer to NULL
 * so the stream cannot be closed twice.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement: it must be followed by a semicolon and is safe to use as the
 * unbraced body of an if / else.
 *
 * @note fptr is evaluated more than once and must be a modifiable lvalue.
 */
#define CSV_PARSER_FREE_FILE_PTR(fptr)      \
do                                          \
{                                           \
    if ((fptr) != NULL)                     \
    {                                       \
        fclose(fptr);                       \
                                            \
        (fptr) = NULL;                      \
    }                                       \
} while (0)

/**
 * @class csv_parser
 *
 * The csv_parser object
 *
 * Used to parse text files to extract records and fields.
 *
 * We are making the following assumptions :
 *
 * @li The record terminator is only one character in length.
 * @li The field terminator is only one character in length.
 * @li The fields are enclosed by single characters, if any.
 *
 * @li The parser can handle documents where fields are always enclosed, not enclosed at all or optionally enclosed.
 * @li When fields are strictly all enclosed, there is an assumption that any enclosure characters within the field are escaped by placing a backslash in front of the enclosure character.
 *
 * The CSV files can be parsed in 3 modes.
 * @li (a) No enclosures
 * @li (b) Fields always enclosed.
 * @li (c) Fields optionally enclosed.
 *
 * For option (c) when the enclosure character is optional, if an enclosure character is spotted at either the beginning
 * or the end of the string, it is assumed that the field is enclosed.
 *
 * The csv_parser::init() method can accept a character array as the path to the CSV file.
 * Since it is overloaded, it can also accept a FILE pointer to a stream that is already open for reading.
 *
 * The set_enclosed_char() method accepts the field enclosure character as the first parameter and the enclosure mode as the second parameter which
 * controls how the text file is going to be parsed.
 *
 * @see csv_parser::set_enclosed_char()
 * @see enclosure_type_t
 *
 * @todo Add ability to parse files where fields/columns are terminated by strings instead of just one char.
 * @todo Add ability to set a string that every line starts with. Currently lines do not have any starting char or string.
 * @todo Add ability to set a string that every line ends with. Currently lines can only end with a single char.
 * @todo Add ability to accept other escape characters besides the backslash character 0x5C.
 * @todo More support for improperly formatted CSV data files.
 *
 * @author Israel Ekpo <israel.ekpo@israelekpo.com>
 */
class csv_parser
{

public :

    /**
     * Class constructor
     *
     * This is the default constructor.
     *
     * All the internal attributes are initialized here
     *
     * @li The enclosure character is initialized to the NUL character 0x00.
     * @li The escape character is initialized to the backslash character 0x5C.
     * @li The field delimiter character is initialized to a comma 0x2C.
     * @li The record delimiter character is initialized to a new line character 0x0A.
     *
     * @li The lengths of all the above-mentioned fields are initialized to 0,1,1 and 1 respectively.
     * @li The number of records to ignore and the record count are set to zero.
     * @li The enclosure mode is set to ENCLOSURE_NONE.
     * @li The more_rows internal attribute is set to false.
     * @li The pointer to the CSV input file is initialized to NULL
     * @li The pointer to the buffer for the file name is also initialized to NULL
     */
    csv_parser() : enclosed_char(0x00),     escaped_char(0x5C),
                   field_term_char(0x2C),      line_term_char(0x0A),
                   enclosed_length(0U),        escaped_length(1U),
                   field_term_length(1U),      line_term_length(1U),
                   ignore_num_lines(0U),       record_count(0U),
                   input_fp(NULL),               input_filename(NULL),
                   enclosure_type(ENCLOSURE_NONE),
                   more_rows(false)
                   { }

    /**
     * Class destructor
     *
     * In the class destructor the file pointer to the input CSV file is closed and
     * the buffer to the input file name is also deallocated.
     *
     * NOTE(review): if init(FILE *) stores the caller's stream in input_fp, that
     * stream is presumably closed here as well -- confirm against the implementation
     * and make sure callers do not close it a second time.
     *
     * @see csv_parser::input_fp
     * @see csv_parser::input_filename
     */
    ~csv_parser()
    {
        CSV_PARSER_FREE_FILE_PTR(input_fp);

        CSV_PARSER_FREE_BUFFER_PTR(input_filename);
    }

    /**
     * Initializes the current object
     *
     * This init method accepts a pointer to the CSV file that has been opened for reading
     *
     * It also resets the file pointer to the beginning of the stream
     *
     * @overload bool init(FILE * input_file_pointer)
     * @param[in] input_file_pointer
     * @return bool Returns true on success and false on error.
     */
    bool init(FILE * input_file_pointer);

    /**
     * Initializes the current object
     *
     * @li This init method accepts a character array as the path to the csv file.
     * @li It sets the value of the csv_parser::input_filename property.
     * @li Then it creates a pointer to the csv_parser::input_fp property.
     *
     * @overload bool init(const char * input_filename)
     * @param[in] input_filename
     * @return bool Returns true on success and false on error.
     */
    bool init(const char * input_filename);

    /**
     * Defines the Field Enclosure character used in the Text File
     *
     * If the enclosure is optional, there could be fields that are enclosed, and fields that are not enclosed within the same line/record.
     *
     * NOTE(review): earlier documentation stated that passing NULL (0x00) as the
     * enclosure character makes the enclosure optional. The parsing mode is a
     * separate parameter and the implementation is not part of this header --
     * confirm how a 0x00 character is actually handled.
     *
     * @see enclosure_type_t
     *
     * @param[in] fields_enclosed_by The character used to enclose the fields.
     * @param[in] enclosure_mode How the CSV file should be parsed.
     * @return void
     */
    void set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode);

    /**
     * Defines the Field Delimiter character used in the text file
     *
     * @param[in] fields_terminated_by
     * @return void
     */
    void set_field_term_char(char fields_terminated_by);

    /**
     * Defines the Record Terminator character used in the text file
     *
     * @param[in] lines_terminated_by
     * @return void
     */
    void set_line_term_char(char lines_terminated_by);

    /**
     * Returns whether there is still more data
     *
     * This method returns a boolean value indicating whether or not there are
     * still more records to be extracted in the current file being parsed.
     *
     * Call this method to see if there are more rows to retrieve before invoking csv_parser::get_row()
     *
     * It only reads the more_rows attribute, so it is const-qualified and can be
     * called on a const parser.
     *
     * @see csv_parser::get_row()
     * @see csv_parser::more_rows
     *
     * @return bool Returns true if there are still more rows and false if there is not.
     */
    bool has_more_rows(void) const
    {
        return more_rows;
    }

    /**
     * Defines the number of records to discard
     *
     * The number of records specified will be discarded during the parsing process.
     *
     * @see csv_parser::_skip_lines()
     * @see csv_parser::get_row()
     * @see csv_parser::has_more_rows()
     *
     * @param[in] lines_to_skip How many records should be skipped
     * @return void
     */
    void set_skip_lines(unsigned int lines_to_skip)
    {
        ignore_num_lines = lines_to_skip;
    }

    /**
     * Return the current row from the CSV file
     *
     * The row is returned as a vector of string objects.
     *
     * This method should be called only if csv_parser::has_more_rows() is true
     *
     * @see csv_parser::has_more_rows()
     * @see csv_parser::get_record_count()
     * @see csv_parser::reset_record_count()
     * @see csv_parser::more_rows
     *
     * @return csv_row A vector type containing an array of strings
     */
    csv_row get_row(void);

    /**
     * Returns the number of times the csv_parser::get_row() method has been invoked
     *
     * It only reads the record_count attribute, so it is const-qualified and can
     * be called on a const parser.
     *
     * @see csv_parser::reset_record_count()
     * @return unsigned int The number of times the csv_parser::get_row() method has been invoked.
     */
    unsigned int get_record_count(void) const
    {
        return record_count;
    }

    /**
     * Resets the record_count internal attribute to zero
     *
     * This may be used if the object is reused multiple times.
     *
     * @see csv_parser::record_count
     * @see csv_parser::get_record_count()
     * @return void
     */
    void reset_record_count(void)
    {
        record_count = 0U;
    }

private :

    /**
     * Copying is disabled
     *
     * The object owns input_fp and input_filename and releases both in its
     * destructor. A member-wise copy would leave two objects holding the same
     * raw pointers, so the stream would be closed twice and the buffer freed
     * twice. The copy constructor and the copy assignment operator are therefore
     * declared private and intentionally left without a definition.
     */
    csv_parser(const csv_parser &);

    csv_parser & operator=(const csv_parser &);

    /**
     * Ignores N records in the CSV file
     *
     * Where N is the value of the csv_parser::ignore_num_lines internal property.
     *
     * The number of lines skipped can be defined by csv_parser::set_skip_lines()
     *
     * @see csv_parser::set_skip_lines()
     *
     * @return void
     */
    void _skip_lines(void);

    /**
     * Reads a Single Line
     *
     * Reads a single record into the buffer passed by reference to the method
     *
     * @param[in,out] buffer A pointer to a character array for the current line.
     * @param[out] buffer_len A pointer to an integer storing the length of the buffer.
     * @return void
     */
    void _read_single_line(char ** buffer, unsigned int * buffer_len);

    /**
     * Extracts the fields without enclosures
     *
     * This is used when the enclosure character is not set
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

    /**
     * Extracts the fields with enclosures
     *
     * This is used when the enclosure character is set.
     *
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

    /**
     * Extracts the fields when enclosure is optional
     *
     * This is used when the enclosure character is optional
     *
     * Hence, there could be fields that use it, and fields that don't.
     *
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

protected :

    /**
     * The enclosure character
     *
     * If present or used for a field it is assumed that both ends of the fields are wrapped.
     *
     * This is that single character used in the document to wrap the fields.
     *
     * @see csv_parser::_get_fields_without_enclosure()
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     *
     * @var enclosed_char
     */
    char enclosed_char;

    /**
     * The escape character
     *
     * For now the only valid escape character allowed is the backslash character 0x5C
     *
     * This is only important when the enclosure character is required or optional.
     *
     * This is the backslash character used to escape enclosure characters found within the fields.
     *
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     * @todo Update the code to accept other escape characters besides the backslash
     *
     * @var escaped_char
     */
    char escaped_char;

    /**
     * The field terminator
     *
     * This is the single character used to separate fields within a record,
     * i.e. it marks the end of a column in the text file.
     *
     * Common characters used include the comma, tab, and semi-colons.
     *
     * @var field_term_char
     */
    char field_term_char;

    /**
     * The record terminator
     *
     * This is the single character used to mark the end of a record in the text file.
     *
     * The most popular one is the new line character however it is possible to use others as well.
     *
     * @see csv_parser::get_row()
     *
     * @var line_term_char
     */
    char line_term_char;

    /**
     * Enclosure length
     *
     * This is the length of the enclosure character
     *
     * @see csv_parser::csv_parser()
     * @see csv_parser::set_enclosed_char()
     *
     * @var enclosed_length
     */
    unsigned int enclosed_length;

    /**
     * The length of the escape character
     *
     * Right now this is really not being used.
     *
     * It may be used in future versions of the object.
     *
     * @todo Update the code to accept other escape characters besides the backslash
     *
     * @var escaped_length
     */
    unsigned int escaped_length;

    /**
     * Length of the field terminator
     *
     * For now this is not being used. It will be used in future versions of the object.
     *
     * @var field_term_length
     */
    unsigned int field_term_length;

    /**
     * Length of the record terminator
     *
     * For now this is not being used. It will be used in future versions of the object.
     *
     * @var line_term_length
     */
    unsigned int line_term_length;

    /**
     * Number of records to discard
     *
     * This variable controls how many records in the file are skipped before parsing begins.
     *
     * @see csv_parser::_skip_lines()
     * @see csv_parser::set_skip_lines()
     *
     * @var ignore_num_lines
     */
    unsigned int ignore_num_lines;

    /**
     * Number of times the get_row() method has been called
     *
     * @see csv_parser::get_row()
     * @var record_count
     */
    unsigned int record_count;

    /**
     * The CSV File Pointer
     *
     * This is the pointer to the CSV file. It is closed by the destructor.
     *
     * @var input_fp
     */
    FILE * input_fp;

    /**
     * Buffer to input file name
     *
     * This buffer is used to store the name of the file that is being parsed.
     * It is released with free() by the destructor.
     *
     * @var input_filename
     */
    char * input_filename;

    /**
     * Mode in which the CSV file will be parsed
     *
     * The various values are explained below
     *
     * @li ENCLOSURE_NONE         (1) means the CSV file does not use any enclosure characters for the fields
     * @li ENCLOSURE_REQUIRED     (2) means the CSV file requires enclosure characters for all the fields
     * @li ENCLOSURE_OPTIONAL     (3) means the use of enclosure characters for the fields is optional
     *
     * @see csv_parser::get_row()
     * @see csv_parser::_read_single_line()
     * @see csv_parser::_get_fields_without_enclosure()
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     *
     * @var enclosure_type
     */
    enclosure_type_t enclosure_type;

    /**
     * There are still more records to parse
     *
     * This boolean property is an internal indicator of whether there are still records in the
     * file to be parsed.
     *
     * @see csv_parser::has_more_rows()
     * @var more_rows
     */
    bool more_rows;

}; /* class csv_parser */

#endif /* CSV_PARSER_HPP_INCLUDED */