taco-db  0.1.0
ExternalSort.h
Go to the documentation of this file.
1 #ifndef SORT_EXTERNALSORT_H
2 #define SORT_EXTERNALSORT_H
3 
4 #include "tdb.h"
5 
6 #include <functional>
7 
8 #include "extsort/ItemIterator.h"
10 #include "storage/FileManager.h"
11 
12 namespace taco {
13 
24 typedef std::function<int(const char *item1, const char *item2)>
26 
27 
35 class ExternalSort {
36 public:
43  static std::unique_ptr<ExternalSort>
44  Create(const SortCompareFunction& comp, size_t merge_ways);
45 
47  ~ExternalSort();
48 
57  std::unique_ptr<ItemIterator> Sort(ItemIterator *input_iter);
58 
59 private:
60 
65  class OutputIterator : public ItemIterator {
66  public:
72  OutputIterator(const File* tmpfile,
73  PageNumber final_num_pages);
74 
76  ~OutputIterator() override;
77 
85  bool Next() override;
86 
91  const char* GetCurrentItem(FieldOffset& p_reclen) override;
92 
97  bool SupportsRewind() const override { return true; }
98 
102  uint64_t SavePosition() const override {
103  // TODO implement it
104  return 0;
105  }
106 
114  void Rewind(uint64_t pos) override;
115 
116  /*
117  * Clean up state.
118  */
119  void EndScan() override;
120  private:
121 
122  // You can add your own states for the iterator here.
123  };
124 
125  ExternalSort(const SortCompareFunction& comp, size_t merge_ways);
126 
135  void SortInitialRuns(ItemIterator* item_iter);
136 
143  void GenerateNewPass(std::vector<PageNumber>& new_run_boundaries);
144 
152  PageNumber MergeInternalRuns(size_t low_run, size_t high_run);
153 
154  void WriteOutInitialPass(std::vector<Record>& pass);
155  void WriteOutRecord(Record& rec);
156 
158  uint8_t m_current_pass;
160  std::unique_ptr<File> m_file[2];
165  size_t m_merge_ways;
169  std::vector<PageNumber> m_run_boundaries;
170 
171  char* m_inputbuf;
172  char* m_outbuf;
175 };
176 
177 } // namespace taco
178 
179 #endif // SORT_EXTERNALSORT_H
The output operator that can used to scan through the output of this ExternalSort instance in ascendi...
Definition: ExternalSort.h:65
void EndScan() override
Ends the scan.
Definition: ExternalSort.cpp:150
~OutputIterator() override
Deconstructor to clean all the states.
Definition: ExternalSort.cpp:128
void Rewind(uint64_t pos) override
Rewind to a particular position saved before.
Definition: ExternalSort.cpp:145
uint64_t SavePosition() const override
Save the position of current iterator so that later on can be rewind to this position.
Definition: ExternalSort.h:102
bool Next() override
Move the iterator to next output record.
Definition: ExternalSort.cpp:133
bool SupportsRewind() const override
Indicate if this iterator supports rewind.
Definition: ExternalSort.h:97
OutputIterator(const File *tmpfile, PageNumber final_num_pages)
Construct an iterator from the tmp file containing the final sorted items.
Definition: ExternalSort.cpp:122
const char * GetCurrentItem(FieldOffset &p_reclen) override
Return the item pointed by this iterator.
Definition: ExternalSort.cpp:139
ExternalSort is a general utility class to do external sorting on any number of general bytes.
Definition: ExternalSort.h:35
ExternalSort(const SortCompareFunction &comp, size_t merge_ways)
Definition: ExternalSort.cpp:6
char * m_inputbuf
Definition: ExternalSort.h:171
size_t m_merge_ways
how many ways of merge this operator allows, implicitly define memory budget as (m_merge_ways + 1) pa...
Definition: ExternalSort.h:165
void SortInitialRuns(ItemIterator *item_iter)
Iterate through item_iterator and create the initial runs based on memory budget.
Definition: ExternalSort.cpp:60
std::unique_ptr< ItemIterator > Sort(ItemIterator *input_iter)
Given an input_iter, it iterates over input_iter in one pass, and sorts all the returned items using ...
Definition: ExternalSort.cpp:117
void WriteOutInitialPass(std::vector< Record > &pass)
Definition: ExternalSort.cpp:48
PageNumber MergeInternalRuns(size_t low_run, size_t high_run)
Merge the runs low_run (inclusive) to high_run (inclusive) from last pass whose boundaries are stored...
Definition: ExternalSort.cpp:97
SortCompareFunction m_cmp
compare functions for sorting items.
Definition: ExternalSort.h:167
char * m_outbuf
Definition: ExternalSort.h:172
VarlenDataPage m_outpg
Definition: ExternalSort.h:173
~ExternalSort()
Deconstructor of ExternalSort
Definition: ExternalSort.cpp:15
PageNumber m_output_pos
Definition: ExternalSort.h:174
void GenerateNewPass(std::vector< PageNumber > &new_run_boundaries)
Generate a new sorted pass through a m_merge_ways merge from m_file[1 - m_current_pass].
Definition: ExternalSort.cpp:103
std::unique_ptr< File > m_file[2]
tmp files used in external sorting
Definition: ExternalSort.h:160
void WriteOutRecord(Record &rec)
Definition: ExternalSort.cpp:28
uint8_t m_current_pass
0 or 1, indicates which file current pass is using.
Definition: ExternalSort.h:158
std::vector< PageNumber > m_run_boundaries
Saves the page boundaries of last pass sorted runs.
Definition: ExternalSort.h:169
static std::unique_ptr< ExternalSort > Create(const SortCompareFunction &comp, size_t merge_ways)
Create an ExternalSort instance where comp provides the method of comparision function of two items.
Definition: ExternalSort.cpp:23
Represents an open virtual file managed by the FileManager.
Definition: FileManager.h:322
A pure abstract iterator over items (i.e., plain byte arrays).
Definition: ItemIterator.h:18
Definition: Record.h:87
VarlenDataPage implements a buffered heap page that supports inserting, deleting, updating and random...
Definition: VarlenDataPage.h:50
Definition: datum.h:28
int16_t FieldOffset
Definition: tdb_base.h:211
uint32_t PageNumber
Definition: tdb_base.h:213
std::function< int(const char *item1, const char *item2)> SortCompareFunction
SortCompareFunction compares item1 and item2 and returns a negative integer if item1 < item2,...
Definition: ExternalSort.h:25