csv.hpp 6.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#ifndef CSV_READER_HPP
#define CSV_READER_HPP

#include <exception>
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <utility>
#include <vector>

namespace csv {

/**
 * @brief Splits one CSV record into its fields.
 *
 * A double quote toggles "quoted" mode; while in quoted mode a comma is kept
 * as ordinary field content, as is any other character (including a newline,
 * if the caller passes one in). Two consecutive double quotes inside a quoted
 * section yield a single literal quote character. The enclosing quotes
 * themselves are not part of the returned field.
 *
 * @param line The CSV record to split.
 * @return The fields in order of appearance. The result always contains at
 * least one element: an empty input yields a single empty field.
 */
inline std::vector<std::string> parse_csv_line(const std::string& line) {
  std::vector<std::string> fields;
  std::string current;
  bool quoted = false;
  const size_t length = line.length();

  size_t idx = 0;
  while (idx < length) {
    const char ch = line[idx];
    switch (ch) {
      case '"':
        if (quoted && idx + 1 < length && line[idx + 1] == '"') {
          // Escaped quote ("") inside a quoted section: emit one quote and
          // consume the second character of the pair.
          current.push_back('"');
          ++idx;
        } else {
          quoted = !quoted;
        }
        break;

      case ',':
        if (quoted) {
          current.push_back(ch);
        } else {
          // Unquoted comma terminates the current field.
          fields.push_back(current);
          current.clear();
        }
        break;

      default:
        current.push_back(ch);
        break;
    }
    ++idx;
  }

  // Whatever is left after the last separator is the final field.
  fields.push_back(current);
  return fields;
}

/**
 * @brief Reads a CSV file into a column-oriented table.
 *
 * The whole file is loaded into memory. The first record supplies the column
 * names; the remaining records are split into chunks that are parsed by
 * worker threads (one per hardware thread, or 4 if that number is unknown)
 * and then merged in file order.
 *
 * Record handling:
 *  - A newline inside a quoted field does not end the record, so fields
 *    containing line breaks survive.
 *  - A trailing '\r' is removed from every record, so files with CRLF line
 *    endings do not leak '\r' into the last field or column name.
 *  - Empty records (blank lines) are skipped.
 *  - A row with fewer fields than there are columns is padded with empty
 *    strings so that all columns keep the same length; fields beyond the
 *    last column are dropped.
 *
 * A progress message is printed to std::cout.
 *
 * @param filename The path to the CSV file.
 * @return A vector of pairs, each containing a column name and a vector of data
 * for that column. All data vectors have the same length.
 * @throws std::runtime_error if the file cannot be opened.
 * @throws Any exception raised while parsing in a worker thread (for example
 * std::bad_alloc) is rethrown on the calling thread after all workers joined.
 */
inline std::vector<std::pair<std::string, std::vector<std::string>>> read_csv(const std::string& filename) {
  std::cout << "Reading CSV file: " << filename << std::endl;

  std::ifstream file(filename);
  if (!file) {
    throw std::runtime_error("Cannot open file");
  }

  // Load the complete file (header included) so that record boundaries can be
  // located with a single quote-aware scanner.
  std::stringstream buffer;
  buffer << file.rdbuf();
  const std::string content = buffer.str();
  const size_t content_size = content.size();

  // Returns the index of the '\n' that terminates the record starting at
  // `from`, or `limit` if no terminator exists before `limit`. `from` must be
  // the start of a record. Counting quote parity is sufficient here: an
  // escaped quote ("") toggles twice and leaves the state unchanged.
  const auto find_record_end = [&content](size_t from, size_t limit) {
    bool in_quotes = false;
    for (size_t p = from; p < limit; ++p) {
      if (content[p] == '"') {
        in_quotes = !in_quotes;
      } else if (content[p] == '\n' && !in_quotes) {
        return p;
      }
    }
    return limit;
  };

  // Copies the record [from, to) out of the buffer, without the '\r' of a
  // CRLF line ending.
  const auto extract_record = [&content](size_t from, size_t to) {
    if (to > from && content[to - 1] == '\r') {
      --to;
    }
    return content.substr(from, to - from);
  };

  // The first record holds the column names.
  const size_t header_end = find_record_end(0, content_size);
  std::vector<std::string> column_names = parse_csv_line(extract_record(0, header_end));
  const size_t body_begin = header_end < content_size ? header_end + 1 : content_size;

  std::vector<std::pair<std::string, std::vector<std::string>>> result;
  result.reserve(column_names.size());
  for (auto& name : column_names) {
    result.emplace_back(std::move(name), std::vector<std::string>());
  }

  // Header-only (or empty) file: nothing to parse, do not spawn any threads.
  if (body_begin >= content_size) {
    return result;
  }

  unsigned int num_threads = std::thread::hardware_concurrency();
  if (num_threads == 0) {
    num_threads = 4;  // hardware_concurrency() may return 0 when unknown
  }

  // Pick chunk boundaries. Chunk i starts right after the first record
  // terminator at or beyond body_begin + i * chunk_size. The scan tracks quote
  // state from the start of the body so that a newline inside a quoted field
  // is never chosen as a boundary.
  const size_t chunk_size = (content_size - body_begin) / num_threads;
  std::vector<size_t> chunk_starts;
  chunk_starts.reserve(static_cast<size_t>(num_threads) + 1);
  chunk_starts.push_back(body_begin);
  {
    bool in_quotes = false;
    unsigned int next_chunk = 1;
    for (size_t p = body_begin; p < content_size && next_chunk < num_threads; ++p) {
      if (content[p] == '"') {
        in_quotes = !in_quotes;
      } else if (content[p] == '\n' && !in_quotes) {
        while (next_chunk < num_threads && p >= body_begin + next_chunk * chunk_size) {
          chunk_starts.push_back(p + 1);
          ++next_chunk;
        }
      }
    }
  }
  // Chunks for which no boundary was found are empty and sit at the end; the
  // last entry is the end of the final chunk.
  chunk_starts.resize(static_cast<size_t>(num_threads) + 1, content_size);

  // Each worker writes only to its own slot, so no locking is required.
  std::vector<std::vector<std::vector<std::string>>> thread_results(num_threads);
  std::vector<std::exception_ptr> thread_errors(num_threads);
  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  try {
    for (unsigned int i = 0; i < num_threads; ++i) {
      const size_t start = chunk_starts[i];
      const size_t end = chunk_starts[i + 1];

      threads.emplace_back(
          [&find_record_end, &extract_record, &thread_results, &thread_errors, start, end, i]() {
            // An exception escaping a thread would call std::terminate, so
            // capture it and let the caller rethrow it.
            try {
              std::vector<std::vector<std::string>> rows;
              size_t pos = start;
              while (pos < end) {
                const size_t record_end = find_record_end(pos, end);
                const std::string record = extract_record(pos, record_end);
                if (!record.empty()) {
                  rows.push_back(parse_csv_line(record));
                }
                pos = record_end + 1;
              }
              thread_results[i] = std::move(rows);
            } catch (...) {
              thread_errors[i] = std::current_exception();
            }
          });
    }
  } catch (...) {
    // Thread creation failed: join what was started before propagating,
    // otherwise destroying a joinable std::thread terminates the program.
    for (auto& t : threads) {
      t.join();
    }
    throw;
  }

  for (auto& t : threads) {
    t.join();
  }
  for (const auto& error : thread_errors) {
    if (error) {
      std::rethrow_exception(error);
    }
  }

  // Merge the per-thread rows, in file order, into the column vectors.
  size_t total_rows = 0;
  for (const auto& rows : thread_results) {
    total_rows += rows.size();
  }
  for (auto& column : result) {
    column.second.reserve(total_rows);
  }

  for (auto& rows : thread_results) {
    for (auto& row : rows) {
      for (size_t col = 0; col < result.size(); ++col) {
        if (col < row.size()) {
          result[col].second.push_back(std::move(row[col]));
        } else {
          // Short row: pad so that every column keeps one entry per row.
          result[col].second.emplace_back();
        }
      }
    }
  }

  return result;
}

/**
 * @brief Writes column-oriented data to a CSV file.
 *
 * The first output line contains the column names, followed by one line per
 * row. Column names and data fields are escaped the same way: a value that
 * contains a double quote, a comma, '\r' or '\n' is enclosed in double quotes
 * and every embedded double quote is doubled. In a single-column table an
 * empty value is written as "" so that the record is not an empty line.
 *
 * The column lengths are validated before the file is opened, so an existing
 * file is not truncated when the input is rejected. If @p data is empty the
 * file is created (or truncated) and left empty.
 *
 * A progress message is printed to std::cout.
 *
 * @param filename The path to the output CSV file.
 * @param data A vector of pairs, each containing a column name and a vector of
 * data for that column.
 * @throws std::runtime_error if the columns differ in length, if the file
 * cannot be opened for writing, or if writing to the file fails.
 */
inline void write_csv(const std::string& filename,
                      const std::vector<std::pair<std::string, std::vector<std::string>>>& data) {
  std::cout << "Writing CSV file: " << filename << std::endl;

  // Validate first: opening the file truncates it.
  if (!data.empty()) {
    const size_t expected_rows = data.front().second.size();
    for (const auto& column : data) {
      if (column.second.size() != expected_rows) {
        throw std::runtime_error("All columns must have the same number of rows");
      }
    }
  }

  std::ofstream file(filename);
  if (!file) {
    throw std::runtime_error("Cannot open file for writing");
  }

  if (data.empty()) {
    return;  // Nothing to write
  }

  const size_t num_rows = data.front().second.size();
  const size_t num_cols = data.size();
  const bool single_column = (num_cols == 1);

  // Writes one value, quoting and escaping it only when necessary.
  const auto write_field = [&file, single_column](const std::string& field) {
    if (field.empty()) {
      if (single_column) {
        // A lone empty field would otherwise produce a blank line, which CSV
        // readers commonly skip.
        file << "\"\"";
      }
      return;
    }
    if (field.find_first_of("\",\r\n") == std::string::npos) {
      file << field;
      return;
    }
    file << '"';
    for (const char c : field) {
      if (c == '"') {
        file << '"';  // double embedded quotes
      }
      file << c;
    }
    file << '"';
  };

  // Header
  for (size_t col = 0; col < num_cols; ++col) {
    if (col != 0) {
      file << ',';
    }
    write_field(data[col].first);
  }
  file << '\n';

  // Data rows
  for (size_t row = 0; row < num_rows; ++row) {
    for (size_t col = 0; col < num_cols; ++col) {
      if (col != 0) {
        file << ',';
      }
      write_field(data[col].second[row]);
    }
    file << '\n';
  }

  // Surface I/O errors (disk full, etc.) instead of silently losing data.
  file.flush();
  if (!file) {
    throw std::runtime_error("Failed to write CSV file");
  }
}

}  // namespace csv

#endif  // CSV_READER_HPP