taco-db  0.1.0
ExternalSort.h
Go to the documentation of this file.
1 #ifndef SORT_EXTERNALSORT_H
2 #define SORT_EXTERNALSORT_H
3 
4 #include "tdb.h"
5 
6 #include <functional>
7 
8 #include "extsort/ItemIterator.h"
10 #include "storage/FileManager.h"
11 
12 namespace taco {
13 
24 typedef std::function<int(const char *item1, const char *item2)>
26 
27 
35 class ExternalSort {
36 public:
43  static std::unique_ptr<ExternalSort>
44  Create(const SortCompareFunction& comp, size_t merge_ways);
45 
47  ~ExternalSort();
48 
57  std::unique_ptr<ItemIterator> Sort(ItemIterator *input_iter);
58 
59 private:
60 
65  class OutputIterator : public ItemIterator {
66  public:
72  OutputIterator(const File* tmpfile,
73  PageNumber final_num_pages);
74 
76  ~OutputIterator() override;
77 
85  bool Next() override;
86 
91  const char* GetCurrentItem(FieldOffset& p_reclen) override;
92 
97  bool SupportsRewind() const override { return true; }
98 
102  uint64_t SavePosition() const override {
103  RecordId rid = {m_pid, m_sid};
104  return *reinterpret_cast<uint64_t*>(&rid);
105  }
106 
114  void Rewind(uint64_t pos) override;
115 
116  /*
117  * Clean up state.
118  */
119  void EndScan() override;
120  private:
121 
122  const File* m_file;
123 
127  char* m_buf;
129  };
130 
131  ExternalSort(const SortCompareFunction& comp, size_t merge_ways);
132 
141  void SortInitialRuns(ItemIterator* item_iter);
142 
149  void GenerateNewPass(std::vector<PageNumber>& new_run_boundaries);
150 
158  PageNumber MergeInternalRuns(size_t low_run, size_t high_run);
159 
160  void WriteOutInitialPass(std::vector<Record>& pass);
161  void WriteOutRecord(Record& rec);
162 
164  uint8_t m_current_pass;
166  std::unique_ptr<File> m_file[2];
171  size_t m_merge_ways;
175  std::vector<PageNumber> m_run_boundaries;
176 
177  char* m_inputbuf;
178  char* m_outbuf;
181 };
182 
183 } // namespace taco
184 
185 #endif // SORT_EXTERNALSORT_H
The output operator that can used to scan through the output of this ExternalSort instance in ascendi...
Definition: ExternalSort.h:65
void EndScan() override
Ends the scan.
Definition: ExternalSort.cpp:245
~OutputIterator() override
Deconstructor to clean all the states.
Definition: ExternalSort.cpp:203
VarlenDataPage m_pg
Definition: ExternalSort.h:128
PageNumber m_pid
Definition: ExternalSort.h:124
void Rewind(uint64_t pos) override
Rewind to a particular position saved before.
Definition: ExternalSort.cpp:235
char * m_buf
Definition: ExternalSort.h:127
uint64_t SavePosition() const override
Save the position of current iterator so that later on can be rewind to this position.
Definition: ExternalSort.h:102
SlotId m_sid
Definition: ExternalSort.h:126
bool Next() override
Move the iterator to next output record.
Definition: ExternalSort.cpp:208
PageNumber m_final_num_pages
Definition: ExternalSort.h:125
bool SupportsRewind() const override
Indicate if this iterator supports rewind.
Definition: ExternalSort.h:97
OutputIterator(const File *tmpfile, PageNumber final_num_pages)
Construct an iterator from the tmp file containing the final sorted items.
Definition: ExternalSort.cpp:194
const File * m_file
Definition: ExternalSort.h:122
const char * GetCurrentItem(FieldOffset &p_reclen) override
Return the item pointed by this iterator.
Definition: ExternalSort.cpp:228
ExternalSort is a general utility class to do external sorting on any number of general bytes.
Definition: ExternalSort.h:35
ExternalSort(const SortCompareFunction &comp, size_t merge_ways)
Definition: ExternalSort.cpp:6
char * m_inputbuf
Definition: ExternalSort.h:177
size_t m_merge_ways
how many ways of merge this operator allows, implicitly define memory budget as (m_merge_ways + 1) pa...
Definition: ExternalSort.h:171
void SortInitialRuns(ItemIterator *item_iter)
Iterate through item_iterator and create the initial runs based on memory budget.
Definition: ExternalSort.cpp:60
std::unique_ptr< ItemIterator > Sort(ItemIterator *input_iter)
Given an input_iter, it iterates over input_iter in one pass, and sorts all the returned items using ...
Definition: ExternalSort.cpp:173
void WriteOutInitialPass(std::vector< Record > &pass)
Definition: ExternalSort.cpp:48
PageNumber MergeInternalRuns(size_t low_run, size_t high_run)
Merge the runs low_run (inclusive) to high_run (inclusive) from last pass whose boundaries are stored...
Definition: ExternalSort.cpp:97
SortCompareFunction m_cmp
compare functions for sorting items.
Definition: ExternalSort.h:173
char * m_outbuf
Definition: ExternalSort.h:178
VarlenDataPage m_outpg
Definition: ExternalSort.h:179
~ExternalSort()
Deconstructor of ExternalSort
Definition: ExternalSort.cpp:15
PageNumber m_output_pos
Definition: ExternalSort.h:180
void GenerateNewPass(std::vector< PageNumber > &new_run_boundaries)
Generate a new sorted pass through a m_merge_ways merge from m_file[1 - m_current_pass].
Definition: ExternalSort.cpp:159
std::unique_ptr< File > m_file[2]
tmp files used in external sorting
Definition: ExternalSort.h:166
void WriteOutRecord(Record &rec)
Definition: ExternalSort.cpp:28
uint8_t m_current_pass
0 or 1, indicates which file current pass is using.
Definition: ExternalSort.h:164
std::vector< PageNumber > m_run_boundaries
Saves the page boundaries of last pass sorted runs.
Definition: ExternalSort.h:175
static std::unique_ptr< ExternalSort > Create(const SortCompareFunction &comp, size_t merge_ways)
Create an ExternalSort instance where comp provides the method of comparision function of two items.
Definition: ExternalSort.cpp:23
Represents an open virtual file managed by the FileManager.
Definition: FileManager.h:322
A pure abstract iterator over items (i.e., plain byte arrays).
Definition: ItemIterator.h:18
Definition: Record.h:87
VarlenDataPage implements a buffered heap page that supports inserting, deleting, updating and random...
Definition: VarlenDataPage.h:50
Definition: datum.h:28
uint16_t SlotId
Definition: tdb_base.h:228
int16_t FieldOffset
Definition: tdb_base.h:217
uint32_t PageNumber
Definition: tdb_base.h:219
std::function< int(const char *item1, const char *item2)> SortCompareFunction
SortCompareFunction compares item1 and item2 and returns a negative integer if item1 < item2,...
Definition: ExternalSort.h:25
The record ID of a record on a page is a pair of ‘(PageNumber, SlotId)’.
Definition: Record.h:17