aGrUM 2.3.2
a C++ library for (probabilistic) graphical models
CSVParser.cpp
Go to the documentation of this file.
1
20
29
30#ifndef DOXYGEN_SHOULD_SKIP_THIS
31
33# ifdef GUM_NO_INLINE
35# endif /* GUM_NO_INLINE */
36
37namespace gum {
38
39 namespace learning {
40
42 CSVParser::CSVParser(std::istream& instream,
43 const std::string& filename,
44 const std::string& delimiter,
45 const char commentmarker,
46 const char quoteMarker) :
47 _line_(),
48 _delimiter_(delimiter), _spaces_(" \t\r"), _delimiterPlusSpaces_(_delimiter_ + _spaces_),
49 _nbLine_(std::size_t(0)), _commentMarker_(commentmarker), _quoteMarker_(quoteMarker),
50 _emptyData_(true), _instream_(&instream), _filename_(filename) {
51 GUM_CONSTRUCTOR(CSVParser);
52 }
53
54
56 CSVParser::~CSVParser() { GUM_DESTRUCTOR(CSVParser); }
57
58
59 void CSVParser::_getNextTriplet_(const std::string& str,
60 std::size_t& first_letter_token,
61 std::size_t& next_token,
62 std::size_t& last_letter_token,
63 std::size_t from) const {
64 first_letter_token = str.find_first_not_of(_spaces_, from);
65
66 if (first_letter_token == std::string::npos) {
67 next_token = last_letter_token = first_letter_token;
68 return;
69 }
70
71 if (str.at(first_letter_token) == _quoteMarker_) {
72 last_letter_token = _correspondingQuoteMarker_(str, first_letter_token);
73
74 if (last_letter_token == std::string::npos)
75 GUM_SYNTAX_ERROR("String quote missing", _filename_, (Size)nbLine(), first_letter_token);
76
77 next_token = str.find_first_of(_delimiter_, last_letter_token + 1);
78 std::size_t next_char = str.find_first_not_of(_spaces_, last_letter_token + 1);
79
80 if (next_char < next_token) {
81 GUM_SYNTAX_ERROR("Delimiter missing", _filename_, (Size)nbLine(), next_char);
82 }
83
84 // remove quote chars from the token
85 first_letter_token += 1;
86 last_letter_token -= 1;
87 } else {
88 next_token = str.find_first_of(_delimiter_, first_letter_token);
89
90 if (next_token == std::string::npos) {
91 last_letter_token = str.find_last_not_of(_spaces_, next_token);
92 } else if (next_token == first_letter_token) {
93 last_letter_token = first_letter_token;
94 } else {
95 last_letter_token = str.find_last_not_of(_delimiterPlusSpaces_, next_token - 1);
96 }
97 }
98 }
99
100
101 void CSVParser::_tokenize_(const std::string& s) {
102 // looking for first commentMarker not in a string
103 std::size_t commentMarker = s.find_first_of(_commentMarker_, 0);
104 std::size_t quoteMarker = s.find_first_of(_quoteMarker_, 0);
105 std::size_t quoteMarkerEnd;
106
107 while (quoteMarker < commentMarker) {
108 quoteMarkerEnd = _correspondingQuoteMarker_(s, quoteMarker);
109
110 if (quoteMarkerEnd == std::string::npos)
111 GUM_SYNTAX_ERROR("String quote missing", _filename_, (Size)nbLine(), quoteMarker);
112
113 while (commentMarker < quoteMarkerEnd) { // the comment was in the quote
114 commentMarker = s.find_first_of(_commentMarker_, commentMarker + 1);
115 }
116
117 quoteMarker = s.find_first_of(_quoteMarker_, quoteMarkerEnd + 1);
118 }
119
120 std::string str = s.substr(0, commentMarker);
121
122 std::size_t counter = 0, first_letter_token, next_token, last_letter_token;
123
124 _getNextTriplet_(str, first_letter_token, next_token, last_letter_token, 0);
125
126 while ((std::string::npos != first_letter_token)
127 && (std::string::npos != last_letter_token)) {
128 if (_data_.size() <= counter) _data_.resize(counter + 1);
129
130 if (first_letter_token == next_token) {
131 _data_[counter] = "";
132 } else if (last_letter_token >= first_letter_token) {
133 const std::size_t fieldlength = last_letter_token + 1 - first_letter_token;
134 _data_[counter].resize(fieldlength);
135 _data_[counter].assign(str, first_letter_token, fieldlength);
136 } else {
137 _data_[counter] = "";
138 }
139
140 counter++;
141
142 if (next_token == std::string::npos) break;
143
144 _getNextTriplet_(str, first_letter_token, next_token, last_letter_token, next_token + 1);
145 }
146
147 // case where we end up with an empty field ...
148 if ((first_letter_token == std::string::npos) && (last_letter_token == first_letter_token)
149 && (next_token == first_letter_token)) {
150 counter++;
151 _data_.resize(counter);
152 _data_[counter - 1] = "";
153 } else {
154 _data_.resize(counter);
155 }
156
157 _emptyData_ = false;
158 }
159
160
162 void CSVParser::useNewStream(std::istream& instream,
163 const std::string& delimiter,
164 const char commentmarker,
165 const char quoteMarker) {
166 _line_.clear();
167 _delimiter_ = delimiter;
168 _spaces_ = " \t\r";
169 _delimiterPlusSpaces_ = _delimiter_ + _spaces_;
170 _nbLine_ = std::size_t(0);
171 _commentMarker_ = commentmarker;
172 _quoteMarker_ = quoteMarker;
173 _emptyData_ = true;
174 _instream_ = &instream;
175 _data_.clear();
176 }
177
178 } /* namespace learning */
179
180} /* namespace gum */
181
182#endif /* DOXYGEN_SHOULD_SKIP_THIS */
Class for fast parsing of CSV file (never more than one line in application memory).
CSVParser(std::istream &in, const std::string &filename, const std::string &delimiter=",", const char commentmarker='#', const char quoteMarker='"')
default constructor
#define GUM_SYNTAX_ERROR(msg, filename, line, column)
Definition exceptions.h:106
include the inlined functions if necessary
Definition CSVParser.h:54
gum is the global namespace for all aGrUM entities
Definition agrum.h:46