Skip to content
11 changes: 6 additions & 5 deletions cobs/construction/classic_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ void classic_combine_streams(

// read many blocks from each file, interleave them into new block, and
// write it out
std::vector<std::vector<char> > in_blocks(streams.size());
std::vector<std::vector<unsigned char> > in_blocks(streams.size());
for (size_t i = 0; i < streams.size(); ++i) {
in_blocks[i].resize(row_bytes[i] * batch_size);
}
Expand All @@ -266,7 +266,7 @@ void classic_combine_streams(
// read data from streams
for (size_t i = 0; i < streams.size(); ++i) {
streams[i].read(
in_blocks[i].data(), row_bytes[i] * this_batch);
(char*)(in_blocks[i].data()), row_bytes[i] * this_batch);
LOG << "stream[" << i << "] read " << streams[i].gcount();
die_unequal(row_bytes[i] * this_batch,
static_cast<size_t>(streams[i].gcount()));
Expand Down Expand Up @@ -322,6 +322,7 @@ void classic_combine_streams(

t.active("write");
ofs.write(out_block.data(), new_row_bytes * this_batch);
std::fill(out_block.begin(), out_block.end(), '\0');
}
t.stop();
}
Expand Down Expand Up @@ -567,12 +568,12 @@ void classic_construct(
fs::path tmp_path, ClassicIndexParameters params)
{
die_unless(params.num_hashes != 0);
die_unless(params.signature_size == 0);

// estimate signature size by finding number of elements in the largest file
uint64_t max_doc_size = get_max_file_size(filelist, params.term_size);
params.signature_size = calc_signature_size(
max_doc_size, params.num_hashes, params.false_positive_rate);
if (params.signature_size == 0)
params.signature_size = calc_signature_size(
max_doc_size, params.num_hashes, params.false_positive_rate);

size_t docsize_roundup = tlx::round_up(filelist.size(), 8);

Expand Down
56 changes: 55 additions & 1 deletion python/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,22 @@
#include <cobs/file/classic_index_header.hpp>
#include <cobs/file/compact_index_header.hpp>
#include <cobs/query/classic_search.hpp>
#include <cobs/util/fs.hpp>
#include <cobs/util/calc_signature_size.hpp>

#include <cobs/settings.hpp>

#include <tlx/string.hpp>

/******************************************************************************/

uint64_t calc_signature_size(
uint64_t num_elements, double num_hashes,
double false_positive_rate)
{
return cobs::calc_signature_size(num_elements, num_hashes, false_positive_rate);
}

void classic_construct(
const std::string& input, const std::string& out_file,
const cobs::ClassicIndexParameters& index_params,
Expand All @@ -40,6 +49,13 @@ void classic_construct_list(
cobs::classic_construct(list, out_file, tmp_path, index_params);
}

void classic_construct_from_documents(
const cobs::DocumentList& list, const std::string& out_dir,
const cobs::ClassicIndexParameters& index_params)
{
cobs::classic_construct_from_documents(list, out_dir, index_params);
}

/******************************************************************************/

void compact_construct(
Expand Down Expand Up @@ -90,6 +106,7 @@ PYBIND11_MODULE(cobs_index, m) {
.. autosummary::
:toctree: _generated

calc_signature_size
classic_construct
classic_construct_list
compact_construct
Expand Down Expand Up @@ -143,7 +160,13 @@ PYBIND11_MODULE(cobs_index, m) {
.def_readwrite("term_size", &DocumentEntry::term_size_,
"fixed term (term) size or zero")
.def_readwrite("term_count", &DocumentEntry::term_count_,
"number of terms if fixed size");
"number of terms if fixed size")
.def("num_terms",
[](DocumentEntry& e, const size_t k) {
return e.num_terms(k);
},
"number of terms",
py::arg("k"));

using cobs::DocumentList;
py::class_<DocumentList>(
Expand Down Expand Up @@ -229,6 +252,23 @@ PYBIND11_MODULE(cobs_index, m) {
"keep_temporary", &ClassicIndexParameters::keep_temporary,
"keep temporary files during construction, default false");

/**************************************************************************/
// calc_signature_size()

m.def(
"calc_signature_size", &calc_signature_size, R"pbdoc(

Calculate the number of cells in a Bloom filter with k hash functions into which num_elements are inserted such that it has expected given fpr.

:param uint64_t num_elements:
:param double num_hashes:
:param double false_positive_rate:

)pbdoc",
py::arg("num_elements"),
py::arg("num_hashes"),
py::arg("false_positive_rate"));

/**************************************************************************/
// classic_construct()

Expand Down Expand Up @@ -266,6 +306,20 @@ Construct a COBS Classic Index from a pre-populated DocumentList object.
py::arg("index_params") = ClassicIndexParameters(),
py::arg("tmp_path") = "");

m.def(
"classic_construct_from_documents", &classic_construct_from_documents, R"pbdoc(

Construct a COBS Classic Index from a pre-populated DocumentList object.

:param DocumentList input: DocumentList object of documents to index
:param str out_dir: path to the output directory
:param ClassicIndexParameters index_params: instance of classic index parameter object

)pbdoc",
py::arg("list"),
py::arg("out_dir"),
py::arg("index_params") = ClassicIndexParameters());

/**************************************************************************/
// CompactIndexParameters

Expand Down
61 changes: 61 additions & 0 deletions src/cobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <cobs/query/compact_index/mmap_search_file.hpp>
#include <cobs/settings.hpp>
#include <cobs/util/calc_signature_size.hpp>
#include <cobs/util/fs.hpp>
#ifdef __linux__
#include <cobs/query/compact_index/aio_search_file.hpp>
#endif
Expand Down Expand Up @@ -197,6 +198,11 @@ int classic_construct(int argc, char** argv) {
"term size (k-mer size), default: "
+ std::to_string(index_params.term_size));

cp.add_bytes(
's', "sig-size", index_params.signature_size,
"signature size, default: "
+ std::to_string(index_params.signature_size));

bool no_canonicalize = false;
cp.add_flag(
"no-canonicalize", no_canonicalize,
Expand Down Expand Up @@ -405,6 +411,57 @@ int compact_construct_combine(int argc, char** argv) {
return 0;
}

int classic_combine(int argc, char** argv) {
tlx::CmdlineParser cp;

cobs::ClassicIndexParameters index_params;

std::string in_dir;
cp.add_param_string(
"in-dir", in_dir, "path to the input directory");

std::string out_dir;
cp.add_param_string(
"out-dir", out_dir, "path to the output directory");

std::string out_file;
cp.add_param_string(
"out-file", out_file, "path to the output file");

cp.add_bytes(
'm', "memory", index_params.mem_bytes,
"memory in bytes to use, default: " +
tlx::format_iec_units(index_params.mem_bytes));

cp.add_size_t(
'T', "threads", index_params.num_threads,
"number of threads to use, default: max cores");

cp.add_flag(
"keep-temporary", index_params.keep_temporary,
"keep temporary files during construction");

if (!cp.sort().process(argc, argv))
return -1;

cp.print_result(std::cerr);

cobs::fs::path tmp_path(out_dir);
cobs::fs::path f;
size_t i = 1;

cobs::fs::copy(in_dir, tmp_path / cobs::pad_index(i));

while(!cobs::classic_combine(tmp_path / cobs::pad_index(i), tmp_path / cobs::pad_index(i + 1),
f, index_params.mem_bytes, index_params.num_threads,
index_params.keep_temporary)) {
i++;
};
cobs::fs::rename(f, out_file);

return 0;
}

/******************************************************************************/

static inline
Expand Down Expand Up @@ -992,6 +1049,10 @@ struct SubTool subtools[] = {
"compact-construct-combine", &compact_construct_combine, true,
"combines the classic indices in <in_dir> to form a compact index"
},
{
"classic-combine", &classic_combine, true,
"combines the classic indices in <in_dir>"
},
{
"query", &query, true,
"query an index"
Expand Down
143 changes: 143 additions & 0 deletions tests/classic_index_construction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,14 @@
*
* All rights reserved. Published under the MIT License in the LICENSE file.
******************************************************************************/
#include <fstream>
#include <algorithm>

#include <fstream>
#include <algorithm>

#include "test_util.hpp"
#include <cobs/file/classic_index_header.hpp>
#include <cobs/query/classic_index/mmap_search_file.hpp>
#include <cobs/util/calc_signature_size.hpp>
#include <cobs/util/file.hpp>
Expand All @@ -21,6 +27,40 @@ static fs::path index_dir = base_dir / "index";
static fs::path index_file = base_dir / "index.cobs_classic";
static fs::path tmp_path = base_dir / "tmp";

// Compare two classic indices. Return true if the bloom filters of both files are the same.
bool compare_classic_indices(const std::string& filename1, const std::string& filename2)
{
std::ifstream ifs1, ifs2;

cobs::deserialize_header<cobs::ClassicIndexHeader>(ifs1, filename1);
cobs::deserialize_header<cobs::ClassicIndexHeader>(ifs2, filename2);

std::istreambuf_iterator<char> begin1(ifs1);
std::istreambuf_iterator<char> begin2(ifs2);

return std::equal(begin1,std::istreambuf_iterator<char>(),begin2); //Second argument is end-of-range iterator
}

// Compare two files. Return true if the contents of both files are the same.
bool compare_files(const std::string& filename1, const std::string& filename2)
{
std::ifstream file1(filename1, std::ifstream::ate | std::ifstream::binary); //open file at the end
std::ifstream file2(filename2, std::ifstream::ate | std::ifstream::binary); //open file at the end
const std::ifstream::pos_type fileSize = file1.tellg();

if (fileSize != file2.tellg()) {
return false; //different file size
}

file1.seekg(0); //rewind
file2.seekg(0); //rewind

std::istreambuf_iterator<char> begin1(file1);
std::istreambuf_iterator<char> begin2(file2);

return std::equal(begin1,std::istreambuf_iterator<char>(),begin2); //Second argument is end-of-range iterator
}

class classic_index_construction : public ::testing::Test
{
protected:
Expand Down Expand Up @@ -151,4 +191,107 @@ TEST_F(classic_index_construction, combine) {
}
}

TEST_F(classic_index_construction, combined_index_same_as_classic_constructed) {
// This test starts with 2 copies of the same randomly generated document.
// We build a classic index for each of these two documents.
// We then combine these two classic indices into one combined index.
// The combined index should be the same as the classic index generated
// through `cobs:classic_construct` on these two documents.
using cobs::pad_index;
fs::create_directories(index_dir);
fs::create_directories(index_dir/pad_index(0));
fs::create_directories(index_dir/pad_index(1));
fs::create_directories(index_dir/pad_index(2));

// prepare 2 copy of a randomly generated document
std::string random_doc_src_string = cobs::random_sequence(1000, 1);
auto random_docs = generate_documents_one(random_doc_src_string, 1);
generate_test_case(random_docs, "random_", input_dir/pad_index(0));
generate_test_case(random_docs, "random_", input_dir/pad_index(1));

cobs::ClassicIndexParameters index_params;
index_params.false_positive_rate = 0.001; // in order to use large signature size
index_params.mem_bytes = 80;
index_params.num_threads = 1;
index_params.continue_ = true;

// generate a classic index for each document
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(0)),
index_dir/pad_index(0)/(pad_index(0) + ".cobs_classic"),
tmp_path, index_params);
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)),
index_dir/pad_index(0)/(pad_index(1) + ".cobs_classic"),
tmp_path, index_params);

// generate a combined index fro both classic constructed index
fs::path combined_index;
cobs::classic_combine(index_dir/pad_index(0), index_dir/pad_index(1), combined_index,
80, 1, false);

// generate a classic index for both docs through classic_construct
std::string classic_constructed_index = index_dir/pad_index(2)/(pad_index(0) +
".cobs_classic");
cobs::classic_construct(cobs::DocumentList(input_dir), classic_constructed_index,
tmp_path, index_params);

ASSERT_TRUE(compare_files(combined_index.string(), classic_constructed_index));
}

TEST_F(classic_index_construction, same_documents_combined_into_same_index) {
// This test starts with 18 copies of the same randomly generated document.
// These documents are split in 4 groups: g1 with 1 copy, g2 with 2 copies,
// g7 with 7 copies and g8 with 8 copies.
// A classic index is constructed through `cobs::classic_construct` for these
// 4 groups: c1, c2, c7 and c8.
// We then combine these 4 classic indices in this way: c1 + c8 = c1c8 and
// c2 + c7 = c2c7.
// The point of this test is to verify that c1c8 and c2c7 are the same.
using cobs::pad_index;
fs::create_directories(index_dir);
fs::create_directories(index_dir/pad_index(0));
fs::create_directories(index_dir/pad_index(1));
fs::create_directories(index_dir/pad_index(18));
fs::create_directories(index_dir/pad_index(27));

// prepare 4 groups of copies of a randomly generated document
std::string random_doc_src_string = cobs::random_sequence(1000, 1);
auto random_doc_one_copy = generate_documents_one(random_doc_src_string, 1);
auto random_doc_two_copies = std::vector<cobs::KMerBuffer<31> >(2, random_doc_one_copy[0]);
auto random_doc_seven_copies = std::vector<cobs::KMerBuffer<31> >(7, random_doc_one_copy[0]);
auto random_doc_eight_copies = std::vector<cobs::KMerBuffer<31> >(8, random_doc_one_copy[0]);
generate_test_case(random_doc_one_copy, "random_", input_dir/pad_index(1));
generate_test_case(random_doc_two_copies, "random_", input_dir/pad_index(2));
generate_test_case(random_doc_seven_copies, "random_", input_dir/pad_index(7));
generate_test_case(random_doc_eight_copies, "random_", input_dir/pad_index(8));

cobs::ClassicIndexParameters index_params;
index_params.false_positive_rate = 0.001; // in order to use large signature size
index_params.mem_bytes = 80;
index_params.num_threads = 1;
index_params.continue_ = true;

// generate a classic index for each document groups
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)),
index_dir/pad_index(18)/(pad_index(1) + ".cobs_classic"),
tmp_path, index_params);
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(8)),
index_dir/pad_index(18)/(pad_index(8) + ".cobs_classic"),
tmp_path, index_params);
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(2)),
index_dir/pad_index(27)/(pad_index(2) + ".cobs_classic"),
tmp_path, index_params);
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(7)),
index_dir/pad_index(27)/(pad_index(7) + ".cobs_classic"),
tmp_path, index_params);

// generate a combined index fro both classic constructed index
fs::path c1c8, c2c7;
cobs::classic_combine(index_dir/pad_index(18), index_dir/pad_index(0), c1c8,
80, 1, false);
cobs::classic_combine(index_dir/pad_index(27), index_dir/pad_index(1), c2c7,
80, 1, false);

ASSERT_TRUE(compare_classic_indices(c1c8.string(), c2c7.string()));
}

/******************************************************************************/