辞書アプリの改良:部分一致(5)
関連エントリ
StarDictの辞書アプリ - Random Note
辞書アプリの改良 - Random Note
辞書アプリの改良:初期化処理の高速化(1) - Random Note
辞書アプリの改良:初期化処理の高速化(2) - Random Note
辞書アプリの改良:コマンド化 - Random Note
辞書アプリの改良:部分一致(1) - Random Note
辞書アプリの改良:部分一致(2) - Random Note
辞書アプリの改良:部分一致(3) - Random Note
辞書アプリの改良:部分一致(4) - Random Note
とりあえずできた
実行結果はこんな感じ
# groovy dict.groovy あいして search: あいして あいしているのにくちにだしていえない あいしてる あいしてるっていわなくてもいいわ あいしてるとくりかえしていう あいしてるよ、ちゅっちゅっ いいあいしてもかちめがない これらのひとびとはたいぶんかをあいしている ちょっとあいして、ながくあいして ちょっとあいして、ながくあいして とてもあいしているよ まだあいしていながら まちがってあいしてしまう もし、わたしのことをあいしてるなら count:13 TIME[All]:2540
ソースはこう。
BaseIndex, Index, MetaIndexは辞書アプリの改良:部分一致(4) - Random Noteのままなので省略。
dict.groovy
def dictionary = new Dictionary("test") key = args[0] println "search: ${key}" def timer = new Timer("All") def resultList = dictionary.search(key) for (idx in resultList) { println idx.word if (resultList.size() <= 10) { println "=> " + dictionary.getDefinition(idx) } } println "count:${resultList.size()}" timer.stop() class Dictionary { def meta def file def Dictionary(String fileName) { meta = new MetaDictionary(fileName) file = new RandomAccessFile(fileName + ".idx", "r") } def search(key) { def keyFragments = cut(key) def metaIndexesList = [] for (fragment in keyFragments) { metaIndexesList << meta.search(fragment.word) } if (metaIndexesList.empty) return [] // 計算量が少なくなるよう、数が少ない順に並び替え metaIndexesList.sort {l, r -> return r.size() <=> l.size()} def andList = metaIndexesList.get(0).clone() int i=1 while (i<metaIndexesList.size()) { def mapA = new HashMap() def mapB = new HashMap() metaIndexesList.get(i).each {metaIndex -> append(mapA, metaIndex)} andList.each {metaIndex -> if (mapA.containsKey(metaIndex.offset)) append(mapB, metaIndex) } def tmpList = [] mapB.keySet().each {offset -> tmpList.addAll(mapA.get(offset)) tmpList.addAll(mapB.get(offset)) } andList = tmpList i++ } def resultList = [] def map = new HashMap() andList.each {append(map, it)} map.keySet().each {offsetKey -> def metaList = map.get(offsetKey) def baseList = metaList.findAll {x -> x.word == keyFragments[0].word} baseList.each {base -> def flag = true for (fragment in keyFragments) { flag = flag && metaList.any { x -> fragment.word == x.word && (x.wordOffset - base.wordOffset) == fragment.wordOffset } } if (flag) resultList << getIndex(base.offset, base.length) } } resultList.sort {l, r-> l.word <=> r.word} return resultList } def append(Map map, MetaIndex idx) { if (map.containsKey(idx.offset)) { map.get(idx.offset) << idx } else { map.put(idx.offset, [idx]) } } def cut(string) { if (string.length() < MetaIndex.KEY_CHAR_LENGTH) return [new MetaIndex(string, 0, 0, 0)] def list = [] int i=0 while (i <string.length()-MetaIndex.KEY_CHAR_LENGTH +1) { list << new MetaIndex(string.substring(i, i+MetaIndex.KEY_CHAR_LENGTH), 0, 0, i) i++ } return list } def getIndex(int offset, int length) { def buf = new byte[length] file.seek(offset) file.read(buf) return new Index(buf, 0, length) } def getDefinition(index) { def defIn = new FileInputStream("test.dict") def buf = new byte[index.length] RandomAccessFile randamFile = new RandomAccessFile("test.dict", "r") randamFile.seek(index.offset) randamFile.read(buf) randamFile.close() return new String(buf, "UTF-8") } } class MetaDictionary { def file = null int count = 0 def MetaDictionary(fileName) { file = new RandomAccessFile(fileName + ".meta", "r") count = file.length() / MetaIndex.LENGTH } def search(key) { assert (key.length() <= MetaIndex.KEY_CHAR_LENGTH) int start = searchStart(key, 0, count) int end = searchEnd(key, start, count) if (start > end) return loadIndex(0, 0) return loadIndex(start, (end-start+1)) } def getIndex(int i) { int len = MetaIndex.LENGTH def buf = new byte[len] file.seek(i * MetaIndex.LENGTH) file.read(buf) return new MetaIndex(buf) } def loadIndex(indexOffset = 0, indexCount = -1) { if (indexOffset < 0) indexOffset = 0 if (indexCount == -1 || file.length() < (indexCount * MetaIndex.LENGTH)) indexCount = count int len = (indexCount * MetaIndex.LENGTH) byte[] buf = new byte[len] file.seek(indexOffset * MetaIndex.LENGTH) file.read(buf) int i=0 def indexes = []; while (i<indexCount) { indexes << new MetaIndex(buf, i * MetaIndex.LENGTH) i ++ } return indexes } def searchStart(key, int start, int end) { if (start == end || end < 0) return start int i = (int)Math.floor((start+end)/2) def idx = getIndex(i) def c = idx.compareTo(key) // println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}" if (c >= 0) { return searchStart(key, start, i) } else { if ((end-start)==1) i = end return searchStart(key, i, end) } } def searchEnd(key, start, end) { if (start == end || end < 0) return end def i = (int)Math.ceil((start+end)/2) if (i>end) i = end def idx = getIndex(i) def c = idx.compareTo(key) // println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}" if (c <= 0) { return searchEnd(key, i, end) } else { if ((end-start)==1) i = start return searchEnd(key, start, i) } } } class Timer { def name def time = new Date() def Timer(n) {name = n} def stop() { println "TIME[${name}]:${(new Date().getTime() - time.getTime())}" } }