Add base::tok::split_tokens() as a better alternative to split_strings()

dacap · Jun 6, 2024 · 5de03f6 · 5de03f6
1 parent c82fd98
commit 5de03f6
Show file tree

Hide file tree

Showing 2 changed files with 157 additions and 0 deletions.
diff --git a/base/tok.h b/base/tok.h
@@ -0,0 +1,113 @@
+// LAF Base Library
+// Copyright (c) 2024 Igara Studio S.A.
+// Copyright (c) 2020 David Capello
+//
+// This file is released under the terms of the MIT license.
+// Read LICENSE.txt for more information.
+//
+// Based on https://github.com/dacap/tok
+
+#ifndef BASE_TOK_H_INCLUDED
+#define BASE_TOK_H_INCLUDED
+#pragma once
+
+#include <iterator>
+#include <string>
+#include <type_traits>
+
+namespace base {
+namespace tok {
+
+// Policy for token_iterator/token_range: runs of consecutive
+// delimiters are skipped as a whole, so empty tokens are never
+// reported. The flag is a real "bool" so it can be used directly as
+// an "if constexpr" condition without an enum-to-bool conversion.
+struct ignore_empties { static constexpr bool allow_empty = false; };
+// Policy for token_iterator/token_range: each delimiter ends one
+// field, so consecutive delimiters report empty tokens. The flag is a
+// real "bool" so it can be used directly as an "if constexpr"
+// condition without an enum-to-bool conversion.
+struct include_empties { static constexpr bool allow_empty = true; };
+
+// Iterator that walks the characters of a string-like container "T"
+// and yields one token per step, where a token is the run of
+// characters found between two occurrences of the delimiter "chr".
+//
+// EmptyPolicy::allow_empty selects how delimiters are handled:
+// - false: runs of delimiters are skipped as a whole, so empty
+//   tokens are never produced.
+// - true: every delimiter separates two fields, so leading, trailing
+//   and consecutive delimiters all produce empty tokens
+//   (",a," gives "", "a", "").
+//
+// With both policies an empty input produces no tokens at all.
+//
+// The current token is stored inside the iterator, so the reference
+// returned by operator*() is valid only until the iterator is
+// incremented or destroyed. Only iterators created for the same
+// input should be compared with each other.
+template<typename T, typename EmptyPolicy>
+class token_iterator {
+public:
+  using iterator_category = std::forward_iterator_tag;
+  using internal_iterator = typename T::const_iterator;
+  using char_type = typename T::value_type;
+  using value_type = typename std::remove_const<T>::type;
+  using difference_type = typename T::difference_type;
+  // Tokens are read-only: these aliases describe what operator->()
+  // and operator*() really return.
+  using pointer = const value_type*;
+  using reference = const value_type&;
+  using const_reference = const T&;
+
+  token_iterator() = delete;
+  token_iterator(const token_iterator&) = default;
+  token_iterator& operator=(const token_iterator&) = default;
+
+  // Creates an iterator for the characters in [begin, end) that is
+  // already positioned on the first token. When there is no token
+  // (e.g. begin == end) the result is the "end" iterator.
+  token_iterator(const internal_iterator& begin,
+                 const internal_iterator& end,
+                 char_type chr) :
+    begin_(begin),
+    inter_(begin),
+    end_(end),
+    chr_(chr) {
+    if constexpr (allow_empty) {
+      // Nothing is skipped here: a delimiter in the very first
+      // position means that the first field is empty.
+      if (inter_ == end_) {
+        finish();
+      }
+      else {
+        read_token();
+      }
+    }
+    else {
+      operator++(); // Skip leading delimiters and read the first word
+    }
+  }
+
+  // Moves to the next token, or to the "end" state when the input is
+  // exhausted.
+  token_iterator& operator++() {
+    if constexpr (allow_empty) {
+      if (inter_ == end_) {
+        // The previous token was the last field of the input.
+        finish();
+        return *this;
+      }
+      // "inter_" is on the delimiter that closed the previous token.
+      // Only that one is consumed, so the next field can be empty
+      // (even when the delimiter is the last character).
+      ++inter_;
+    }
+    else {
+      while (inter_ != end_ && *inter_ == chr_) {
+        ++inter_;
+      }
+      if (inter_ == end_) {
+        finish();
+        return *this;
+      }
+    }
+    read_token();
+    return *this;
+  }
+
+  // Postfix version: returns a copy positioned on the token that was
+  // current before the increment.
+  token_iterator operator++(int) {
+    token_iterator previous(*this);
+    operator++();
+    return previous;
+  }
+
+  const_reference operator*() const {
+    return str_;
+  }
+
+  pointer operator->() const {
+    return &str_;
+  }
+
+  // Two iterators are equal when both are exhausted, or when both
+  // are on a token that starts at the same position.
+  bool operator==(const token_iterator& that) const {
+    return (done_ == that.done_ &&
+            (done_ || begin_ == that.begin_));
+  }
+
+  bool operator!=(const token_iterator& that) const {
+    return !(*this == that);
+  }
+
+private:
+  // Explicit conversion so the policy can declare the flag either as
+  // a bool or as an enumerator.
+  static constexpr bool allow_empty =
+    static_cast<bool>(EmptyPolicy::allow_empty);
+
+  // Copies into "str_" the characters from "inter_" up to (but not
+  // including) the next delimiter or the end of the input.
+  void read_token() {
+    begin_ = inter_;
+    while (inter_ != end_ && *inter_ != chr_) {
+      ++inter_;
+    }
+    str_.assign(begin_, inter_);
+  }
+
+  // Switches to the "end" state. An explicit flag is required because
+  // a trailing empty token also starts at "end_".
+  void finish() {
+    begin_ = inter_ = end_;
+    str_.clear();
+    done_ = true;
+  }
+
+  internal_iterator begin_, inter_, end_;
+  char_type chr_;
+  value_type str_;
+  bool done_ = false;
+};
+
+// Range over the tokens of the string "str" separated by the
+// character "chr", usable in a range-based for loop. The "Empties"
+// policy is forwarded to token_iterator.
+//
+// Only a reference to "str" is stored (no copy is made), so the
+// string must outlive the range and every iterator created from it.
+template<typename T, typename Empties>
+class token_range {
+public:
+  using iterator = token_iterator<T, Empties>;
+  using char_type = typename T::value_type;
+
+  token_range(const T& str, char_type chr)
+    : str_(str)
+    , chr_(chr) {
+  }
+
+  // Iterator on the first token.
+  iterator begin() const {
+    return iterator_from(str_.begin());
+  }
+
+  // Past-the-end iterator.
+  iterator end() const {
+    return iterator_from(str_.end());
+  }
+
+private:
+  // Builds an iterator that scans from "first" up to the end of the
+  // referenced string.
+  iterator iterator_from(typename T::const_iterator first) const {
+    return iterator(first, str_.end(), chr_);
+  }
+
+  const T& str_;
+  char_type chr_;
+};
+
+// Returns a range to iterate the tokens of "str" separated by "chr",
+// using the ignore_empties policy. The range keeps a reference to
+// "str", so "str" must stay alive while the range is in use.
+template<typename T>
+auto split_tokens(const T& str,
+                  typename T::value_type chr)
+  -> token_range<T, ignore_empties> {
+  return { str, chr };
+}
+
+// Returns a range to iterate the fields of "str" separated by "chr"
+// (a comma by default), using the include_empties policy. The range
+// keeps a reference to "str", so "str" must stay alive while the
+// range is in use.
+template<typename T>
+auto csv(const T& str,
+         typename T::value_type chr = ',')
+  -> token_range<T, include_empties> {
+  return { str, chr };
+}
+
+} // namespace tok
+} // namespace base
+
+#endif
diff --git a/base/tok_tests.cpp b/base/tok_tests.cpp
@@ -0,0 +1,44 @@
+// LAF Base Library
+// Copyright (c) 2024 Igara Studio S.A.
+// Copyright (c) 2020 David Capello
+//
+// This file is released under the terms of the MIT license.
+// Read LICENSE.txt for more information.
+//
+// Based on https://github.com/dacap/tok
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "base/tok.h"
+
+// split_tokens() must report each word once, treating a run of
+// several spaces as a single separator.
+TEST(Tok, SplitTokens)
+{
+  const std::vector<std::string> a_result = { "This", "is", "a", "phrase.", "Several", "whitespaces", "are", "ignored." };
+  const std::string a = "This is a phrase.   Several whitespaces are ignored.";
+  std::size_t i = 0;
+  for (const auto& tok : base::tok::split_tokens(a, ' ')) {
+    std::cout << "\"" << tok << "\"\n";
+    // Stop here instead of reading past the end of the expected list
+    // when more tokens than expected are produced.
+    ASSERT_LT(i, a_result.size());
+    EXPECT_EQ(tok, a_result[i++]);
+  }
+  // Without this check the test would also pass when fewer tokens
+  // than expected (or none at all) are produced.
+  EXPECT_EQ(i, a_result.size());
+}
+
+// csv() must report one field per comma, so consecutive commas
+// produce empty fields.
+TEST(Tok, Csv)
+{
+  const std::vector<std::string> b_result = { "In comma", "separated", "", "values", "", "", "empties are included" };
+  const std::string b = "In comma,separated,,values,,,empties are included";
+  std::size_t i = 0;
+  for (const auto& tok : base::tok::csv(b, ',')) {
+    std::cout << "\"" << tok << "\"\n";
+    // Stop here instead of reading past the end of the expected list
+    // when more fields than expected are produced.
+    ASSERT_LT(i, b_result.size());
+    EXPECT_EQ(tok, b_result[i++]);
+  }
+  // Without this check the test would also pass when fewer fields
+  // than expected (or none at all) are produced.
+  EXPECT_EQ(i, b_result.size());
+}
+
+// Entry point of the test program: hands the command line arguments
+// to Google Test and returns the value produced by running all the
+// registered tests.
+int main(int argc, char** argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+  const int result = RUN_ALL_TESTS();
+  return result;
+}