taco-db  0.1.0
CSVReader.h
Go to the documentation of this file.
1 #ifndef UTILS_CSVREADER_H
2 #define UTILS_CSVREADER_H
3 
4 #include "tdb.h"
5 
6 #include "catalog/Schema.h"
7 #include "storage/FSFile.h"
8 #include "storage/Record.h"
9 
10 namespace taco {
11 
// CSVReader: pulls CSV fields one at a time out of a caller-supplied character
// buffer. The tokenizing itself lives in NextImpl() (CSVReader.cpp); this header
// only adds header-line skipping and position/status accessors on top of it.
//
// NOTE(review): this listing omits several gutter lines. According to the
// definition index that follows the listing, the omitted private members are
// `bool m_header_skipped` (line 59), `bool m_in_broken_field` (65),
// `bool m_broken_field_in_quote` (67) and `bool m_broken_field_ends_in_quote`
// (69).
12 class CSVReader {
13 public:
   // has_header currently only has the effect of skipping the first line.
   // delim is the field separator character (default ',').
17  CSVReader(bool has_header = false, char delim = ',');
18 
   // Non-virtual destructor, defined in CSVReader.cpp.
19  ~CSVReader();
20 
   // Defined in CSVReader.cpp. Presumably hands the reader the next `n` bytes
   // at `buf` to tokenize -- TODO confirm ownership/lifetime of `buf` there.
21  void FeedBuffer(char *buf, size_t n);
22 
   // Fetches the next field into `field` and returns NextImpl()'s result.
   //
   // Once m_header_skipped is set this is a plain forward to NextImpl().
   // Before that, fields are read and discarded until AtEOL() reports the
   // end of a line; the flag is then set and the following NextImpl() result
   // (the first field after the skipped line) is returned. If NextImpl()
   // returns false before any end-of-line is seen, this returns false and
   // the flag stays unset.
   //
   // NOTE(review): m_header_skipped is presumably initialised to !has_header
   // by the constructor -- confirm in CSVReader.cpp.
23  bool
24  Next(absl::string_view &field) {
25  if (m_header_skipped)
26  return NextImpl(field);
27  while (NextImpl(field)) {
28  if (AtEOL()) {
29  m_header_skipped = true;
30  return NextImpl(field);
31  }
32  }
33  return false;
34  }
35 
   // Returns the m_eol flag (maintained outside this header; presumably set by
   // NextImpl() when the field it just produced ended a line).
36  bool
37  AtEOL() const {
38  return m_eol;
39  }
40 
   // True when the read position m_ipos equals the buffer size m_bufsz, i.e.
   // nothing is left to read in the currently fed buffer.
41  bool
42  AtEOF() const {
43  return m_ipos == m_bufsz;
44  }
45 
   // Returns m_lno + 1 (presumably m_lno is zero-based, making this a 1-based
   // line number for diagnostics).
46  size_t
47  GetCurLineNumber() const {
48  return m_lno + 1;
49  }
50 
   // Returns m_ipos - m_llpos + 1, the offset of the read position relative to
   // m_llpos, plus one. m_llpos is a signed ptrdiff_t while m_ipos is size_t;
   // presumably m_llpos may go negative when the current line began in an
   // earlier buffer -- TODO confirm in CSVReader.cpp.
51  size_t
52  GetCurCharNumber() const {
53  return m_ipos - m_llpos + 1;
54  }
55 
56 private:
   // Field extraction routine, defined in CSVReader.cpp.
57  bool NextImpl(absl::string_view &field);
58 
60 
   // Field delimiter; const, so it is fixed at construction.
61  const char m_delim;
62 
   // Value reported by AtEOL().
63  bool m_eol;
64 
66 
68 
70 
   // Raw pointer to the character buffer being read (see FeedBuffer()).
71  char *m_buf;
72 
   // Compared against m_ipos by AtEOF(); the size of the buffer at m_buf.
73  size_t m_bufsz;
74 
   // Current read position; equals m_bufsz when the buffer is exhausted.
75  size_t m_ipos;
76 
   // Line counter; GetCurLineNumber() reports m_lno + 1.
77  size_t m_lno;
78 
   // Reference position subtracted from m_ipos by GetCurCharNumber().
79  ptrdiff_t m_llpos;
80 
   // Owned string storage; its use is not visible in this header (presumably
   // backs fields that cannot be returned as a direct view into m_buf).
81  std::string m_outbuf;
82 };
83 
// FileCSVReader: a CSVReader that is fed from files (FeedFile) rather than from
// caller-supplied buffers, and that can additionally return whole records once
// a Schema has been set.
//
// NOTE(review): this listing omits several gutter lines. According to the
// definition index that follows the listing, the omitted declarations are
// `Record NextSerializedRecord()` (public; "The returned record points to an
// internal buffer."), `bool m_is_last_file` (line 111), `size_t m_nbytes_read`
// (117), `unique_malloced_ptr m_buf` (119) and `maxaligned_char_buf m_recbuf`
// (129).
84 class FileCSVReader: public CSVReader {
85 public:
   // has_header and delim mirror the CSVReader constructor parameters;
   // bufsize defaults to 65536 and is presumably the size in bytes of the
   // internal read buffer -- confirm in CSVReader.cpp.
86  FileCSVReader(bool has_header = false,
87  char delim = ',',
88  size_t bufsize = (size_t) 65536);
89 
   // Defined in CSVReader.cpp. Presumably opens `filename` as the next input;
   // is_last_file (default true) presumably marks whether more files will
   // follow -- TODO confirm the exact contract there.
90  void FeedFile(absl::string_view filename, bool is_last_file = true);
91 
   // Same name and signature as the non-virtual CSVReader::Next(), so this
   // declaration hides the base version rather than overriding it: a call
   // made through a CSVReader reference or pointer still runs
   // CSVReader::Next(). Defined in CSVReader.cpp.
92  bool Next(absl::string_view &field);
93 
   // May only be called once.
97  void SetSchema(const Schema *sch);
98 
   // The returned vector points to an internal buffer.
   // NOTE(review): the end-of-input return value (presumably nullptr) is not
   // visible here -- confirm in CSVReader.cpp.
103  const std::vector<Datum> *NextDeserializedRecord();
104 
109 
110 private:
112 
   // Owning handle to an FSFile (presumably the file currently being read).
113  std::unique_ptr<FSFile> m_fsfile;
114 
   // Presumably the size in bytes of the file held by m_fsfile.
115  size_t m_fsize;
116 
118 
120 
   // Same name as the private CSVReader::m_bufsz; this one is const and
   // distinct from the base member (presumably the bufsize constructor
   // argument).
121  const size_t m_bufsz;
122 
   // Non-owning pointer to the Schema passed to SetSchema().
123  const Schema *m_schema;
124 
   // One FunctionInfo per entry; presumably the per-field input (parse)
   // functions looked up from the schema -- TODO confirm.
125  std::vector<FunctionInfo> m_infuncs;
126 
   // Datum storage; presumably the internal buffer that
   // NextDeserializedRecord() returns a pointer to.
127  std::vector<Datum> m_data;
128 
130 };
131 
132 }
133 
134 #endif // UTILS_CSVREADER_H
taco::CSVReader
Definition: CSVReader.h:12
~CSVReader()
Definition: CSVReader.cpp:23
bool Next(absl::string_view &field)
Definition: CSVReader.h:24
CSVReader(bool has_header=false, char delim=',')
has_header currently only has the effect of skipping the first line.
Definition: CSVReader.cpp:8
bool AtEOL() const
Definition: CSVReader.h:37
bool m_header_skipped
Definition: CSVReader.h:59
bool m_broken_field_ends_in_quote
Definition: CSVReader.h:69
char * m_buf
Definition: CSVReader.h:71
bool m_broken_field_in_quote
Definition: CSVReader.h:67
size_t GetCurLineNumber() const
Definition: CSVReader.h:47
bool m_in_broken_field
Definition: CSVReader.h:65
bool AtEOF() const
Definition: CSVReader.h:42
bool m_eol
Definition: CSVReader.h:63
size_t m_bufsz
Definition: CSVReader.h:73
std::string m_outbuf
Definition: CSVReader.h:81
size_t GetCurCharNumber() const
Definition: CSVReader.h:52
size_t m_ipos
Definition: CSVReader.h:75
bool NextImpl(absl::string_view &field)
Definition: CSVReader.cpp:34
size_t m_lno
Definition: CSVReader.h:77
void FeedBuffer(char *buf, size_t n)
Definition: CSVReader.cpp:26
const char m_delim
Definition: CSVReader.h:61
ptrdiff_t m_llpos
Definition: CSVReader.h:79
taco::FileCSVReader
Definition: CSVReader.h:84
std::vector< Datum > m_data
Definition: CSVReader.h:127
std::unique_ptr< FSFile > m_fsfile
Definition: CSVReader.h:113
const Schema * m_schema
Definition: CSVReader.h:123
FileCSVReader(bool has_header=false, char delim=',', size_t bufsize=(size_t) 65536)
Definition: CSVReader.cpp:168
void SetSchema(const Schema *sch)
May only be called once.
Definition: CSVReader.cpp:234
size_t m_nbytes_read
Definition: CSVReader.h:117
const std::vector< Datum > * NextDeserializedRecord()
The returned vector points to an internal buffer.
Definition: CSVReader.cpp:259
const size_t m_bufsz
Definition: CSVReader.h:121
bool Next(absl::string_view &field)
Definition: CSVReader.cpp:198
void FeedFile(absl::string_view filename, bool is_last_file=true)
Definition: CSVReader.cpp:181
size_t m_fsize
Definition: CSVReader.h:115
unique_malloced_ptr m_buf
Definition: CSVReader.h:119
bool m_is_last_file
Definition: CSVReader.h:111
std::vector< FunctionInfo > m_infuncs
Definition: CSVReader.h:125
Record NextSerializedRecord()
The returned record points to an internal buffer.
Definition: CSVReader.cpp:307
maxaligned_char_buf m_recbuf
Definition: CSVReader.h:129
taco::Record
Definition: Record.h:87
taco::Schema
A Schema object stores the information for accessing an ordered set of typed fields either from a dis...
Definition: Schema.h:39
taco::Datum
Definition: datum.h:28
std::vector< char, AlignedAllocImpl::aligned_allocator< 8, char > > maxaligned_char_buf
Definition: tdb_base.h:155
std::unique_ptr< void, AlignedAllocImpl::FreeMem > unique_malloced_ptr
Definition: tdb_base.h:94