00001
00002 #pragma once
00003
00004
00005
00006 #include<iostream>
00007
00008
00009 #include <unicode/stringpiece.h>
00010 #include <unicode/translit.h>
00011 #include <unicode/utypes.h>
00012 #include <unicode/unistr.h>
00013 #include <unicode/uchar.h>
00014 #include <unicode/utf8.h>
00015 #include <vector>
00016
00017 #include "moses/TranslationModel/UG/mm/tpt_typedefs.h"
00018
00019
00020 namespace stringdist
00021 {
00022 float
00023 levenshtein(UChar const* a, size_t const lenA,
00024 UChar const* b, size_t const lenB);
00025
00026 UErrorCode strip_accents(UnicodeString & trg);
00027
00028 float
00029 fillAlignmentMatrix(UChar const* a, size_t const lenA,
00030 UChar const* b, size_t const lenB,
00031 std::vector<std::vector<float> > & M);
00032
00033 class StringDiff
00034 {
00035 public:
00036 enum MATCHTYPE
00037 {
00038 same,
00039 cap,
00040 flip,
00041 permutation,
00042 accent,
00043 duplication,
00044 insertion,
00045 deletion,
00046 mismatch,
00047 noinit
00048 };
00049
00050 struct Segment
00051 {
00052 static char const* elabel[];
00053 int start_a, end_a;
00054 int start_b, end_b;
00055 MATCHTYPE match;
00056 float dist;
00057 Segment();
00058 Segment(size_t const as, size_t const ae,
00059 size_t const bs, size_t const be,
00060 UnicodeString const& a,
00061 UnicodeString const& b);
00062 char const* label() const;
00063 };
00064 private:
00065 UnicodeString a,b;
00066 std::vector<Segment> difflist;
00067 std::vector<int> diffcnt;
00068 public:
00069 UnicodeString const& set_a(std::string const& a);
00070 UnicodeString const& set_b(std::string const& b);
00071 UnicodeString const& get_a() const;
00072 UnicodeString const& get_b() const;
00073 StringDiff(std::string const& a, std::string const& b);
00074 StringDiff();
00075 size_t size();
00076 size_t align(bool force=false);
00077 void showDiff(std::ostream& out);
00078 float levenshtein();
00079 Segment const& operator[](uint32_t i) const;
00080 void fillAlignmentMatrix(std::vector<std::vector<float> > & M) const;
00081 vector<int> const& getFeatures() const;
00082 };
00083 }