00001 #include "lm/builder/corpus_count.hh"
00002
00003 #include "lm/builder/payload.hh"
00004 #include "lm/common/ngram_stream.hh"
00005 #include "lm/common/ngram.hh"
00006
00007 #include "util/file.hh"
00008 #include "util/file_piece.hh"
00009 #include "util/tokenize_piece.hh"
00010 #include "util/stream/chain.hh"
00011 #include "util/stream/stream.hh"
00012
00013 #define BOOST_TEST_MODULE CorpusCountTest
00014 #include <boost/test/unit_test.hpp>
00015
00016 namespace lm { namespace builder { namespace {
00017
// Verifies the n-gram at the head of `stream`, then advances the stream.
//   str: expected words as one string, separated by spaces.
//   cnt: expected value of the n-gram's payload count.
// The expansion site must provide three names:
//   stream - the n-gram stream under test,
//   v      - array mapping a word index to its expected spelling,
//   w      - a WordIndex* used as a scratch cursor over the n-gram's words.
// BOOST_REQUIRE runs first so the dereferences below are skipped once the
// stream tests false.  The body is wrapped in do/while(0) so that
// `Check(...);` forms exactly one statement (safe under an unbraced if/else).
#define Check(str, cnt) do { \
  BOOST_REQUIRE(stream); \
  w = stream->begin(); \
  for (util::TokenIter<util::AnyCharacter, true> t((str), " "); t; ++t, ++w) { \
    BOOST_CHECK_EQUAL(*t, v[*w]); \
  } \
  BOOST_CHECK_EQUAL(static_cast<uint64_t>(cnt), stream->Value().count); \
  ++stream; \
} while (0)
00027
00028 BOOST_AUTO_TEST_CASE(Short) {
00029 util::scoped_fd input_file(util::MakeTemp("corpus_count_test_temp"));
00030 const char input[] = "looking on a little more loin\non a little more loin\non foo little more loin\nbar\n\n";
00031
00032
00033
00034
00035 util::WriteOrThrow(input_file.get(), input, sizeof(input) - 1);
00036 util::FilePiece input_piece(input_file.release(), "temp file");
00037
00038 util::stream::ChainConfig config;
00039 config.entry_size = NGram<BuildingPayload>::TotalSize(3);
00040 config.total_memory = config.entry_size * 20;
00041 config.block_count = 2;
00042
00043 util::scoped_fd vocab(util::MakeTemp("corpus_count_test_vocab"));
00044
00045 util::stream::Chain chain(config);
00046 uint64_t token_count;
00047 WordIndex type_count = 10;
00048 std::vector<bool> prune_words;
00049 CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT);
00050 chain >> boost::ref(counter);
00051 NGramStream<BuildingPayload> stream(chain.Add());
00052 chain >> util::stream::kRecycle;
00053
00054 const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"};
00055
00056 WordIndex *w;
00057
00058 Check("<s> <s> looking", 1);
00059 Check("<s> looking on", 1);
00060 Check("looking on a", 1);
00061 Check("on a little", 2);
00062 Check("a little more", 2);
00063 Check("little more loin", 2);
00064 Check("more loin </s>", 2);
00065 Check("<s> <s> on", 2);
00066 Check("<s> on a", 1);
00067 Check("<s> on foo", 1);
00068 Check("on foo little", 1);
00069 Check("foo little more", 1);
00070 Check("little more loin", 1);
00071 Check("more loin </s>", 1);
00072 Check("<s> <s> bar", 1);
00073 Check("<s> bar </s>", 1);
00074 Check("<s> <s> </s>", 1);
00075 BOOST_CHECK(!stream);
00076 BOOST_CHECK_EQUAL(sizeof(v) / sizeof(const char*), type_count);
00077 }
00078
00079 }}}