00001 #pragma once 00002 00003 #include "util/string_piece.hh" 00004 00005 namespace MosesTraining { 00006 namespace Syntax { 00007 00008 enum TreeFragmentTokenType { 00009 TreeFragmentToken_EOS, 00010 TreeFragmentToken_LSB, 00011 TreeFragmentToken_RSB, 00012 TreeFragmentToken_WORD 00013 }; 00014 00015 struct TreeFragmentToken { 00016 public: 00017 TreeFragmentToken(TreeFragmentTokenType, StringPiece, std::size_t); 00018 TreeFragmentTokenType type; 00019 StringPiece value; 00020 std::size_t pos; 00021 }; 00022 00023 // Tokenizes tree fragment strings in Moses format. 00024 // 00025 // For example, the string "[S [NP [NN weasels]] [VP]]" is tokenized to the 00026 // sequence: 00027 // 00028 // 1 LSB "[" 00029 // 2 WORD "S" 00030 // 3 LSB "[" 00031 // 4 WORD "NP" 00032 // 5 LSB "[" 00033 // 6 WORD "NN" 00034 // 7 WORD "a" 00035 // 8 RSB "]" 00036 // 9 RSB "]" 00037 // 10 LSB "[" 00038 // 11 WORD "VP" 00039 // 12 RSB "]" 00040 // 13 RSB "]" 00041 // 14 EOS undefined 00042 // 00043 class TreeFragmentTokenizer { 00044 public: 00045 TreeFragmentTokenizer(); 00046 TreeFragmentTokenizer(const StringPiece &); 00047 00048 const TreeFragmentToken &operator*() const { return value_; } 00049 const TreeFragmentToken *operator->() const { return &value_; } 00050 00051 TreeFragmentTokenizer &operator++(); 00052 TreeFragmentTokenizer operator++(int); 00053 00054 friend bool operator==(const TreeFragmentTokenizer &, 00055 const TreeFragmentTokenizer &); 00056 00057 friend bool operator!=(const TreeFragmentTokenizer &, 00058 const TreeFragmentTokenizer &); 00059 00060 private: 00061 StringPiece str_; 00062 TreeFragmentToken value_; 00063 StringPiece::const_iterator iter_; 00064 StringPiece::const_iterator end_; 00065 std::size_t pos_; 00066 }; 00067 00068 } // namespace Syntax 00069 } // namespace MosesTraining