辞書アプリの改良:部分一致(5)

とりあえずできた

実行結果はこんな感じ

# groovy dict.groovy あいして
search: あいして
あいしているのにくちにだしていえない
あいしてる
あいしてるっていわなくてもいいわ
あいしてるとくりかえしていう
あいしてるよ、ちゅっちゅっ
いいあいしてもかちめがない
これらのひとびとはたいぶんかをあいしている
ちょっとあいして、ながくあいして
ちょっとあいして、ながくあいして
とてもあいしているよ
まだあいしていながら
まちがってあいしてしまう
もし、わたしのことをあいしてるなら
count:13
TIME[All]:2540

ソースはこう。
BaseIndex, Index, MetaIndexは辞書アプリの改良:部分一致(4) - Random Noteのままなので省略。


dict.groovy

def dictionary = new Dictionary("test")

key = args[0]
println "search: ${key}"

def timer = new Timer("All")
def resultList = dictionary.search(key)
for (idx in resultList) {
	println idx.word
	if (resultList.size() <= 10) {
		println "=> " + dictionary.getDefinition(idx)
	}
}
println "count:${resultList.size()}" 
timer.stop()

class Dictionary {
	def meta
	def file
	def Dictionary(String fileName) {
		meta = new MetaDictionary(fileName)
		file = new RandomAccessFile(fileName + ".idx", "r")
	}
	
	def search(key) {
		def keyFragments = cut(key)
		def metaIndexesList = []
		for (fragment in keyFragments) {
			metaIndexesList << meta.search(fragment.word)
		}
		if (metaIndexesList.empty) return []
		// 計算量が少なくなるよう、数が少ない順に並び替え
		metaIndexesList.sort {l, r -> return r.size() <=> l.size()}
		def andList = metaIndexesList.get(0).clone()
		int i=1
		while (i<metaIndexesList.size()) {
			def mapA = new HashMap()
			def mapB = new HashMap()
			metaIndexesList.get(i).each {metaIndex -> append(mapA, metaIndex)}
			andList.each {metaIndex ->
					if (mapA.containsKey(metaIndex.offset)) append(mapB, metaIndex)
				}
			def tmpList = []
			mapB.keySet().each {offset ->
					tmpList.addAll(mapA.get(offset))
					tmpList.addAll(mapB.get(offset))
				}
			andList = tmpList
			i++
		}
		def resultList = []
		def map = new HashMap()
		andList.each {append(map, it)}
		
		map.keySet().each {offsetKey ->
				def metaList = map.get(offsetKey)
				def baseList = metaList.findAll {x -> x.word == keyFragments[0].word}
				baseList.each {base ->
						def flag = true
						for (fragment in keyFragments) {
							flag = flag && 
								metaList.any {
									x -> fragment.word == x.word && (x.wordOffset - base.wordOffset) == fragment.wordOffset
								}
						}
						if (flag) resultList << getIndex(base.offset, base.length)
					}
			}
		resultList.sort {l, r-> l.word <=> r.word}
		return resultList
	}
	
	def append(Map map, MetaIndex idx) {
		if (map.containsKey(idx.offset)) {
			map.get(idx.offset) << idx
		} else {
			map.put(idx.offset, [idx])
		}
	}
	
	def cut(string) {
		if (string.length() < MetaIndex.KEY_CHAR_LENGTH) return [new MetaIndex(string, 0, 0, 0)]
		def list = []
		int i=0
		while (i <string.length()-MetaIndex.KEY_CHAR_LENGTH +1) {
			list << new MetaIndex(string.substring(i, i+MetaIndex.KEY_CHAR_LENGTH), 0, 0, i)
			i++
		}
		return list
	}
	
	def getIndex(int offset, int length) {
		def buf = new byte[length]
		file.seek(offset)
		file.read(buf)
		return new Index(buf, 0, length)
	}
	
	def getDefinition(index) {
		def defIn = new FileInputStream("test.dict")
		def buf = new byte[index.length]
		RandomAccessFile randamFile = new RandomAccessFile("test.dict", "r")
		randamFile.seek(index.offset)
		randamFile.read(buf)
		randamFile.close()
		return new String(buf, "UTF-8")
	}
}

class MetaDictionary {
	def file = null
	int count = 0
	def MetaDictionary(fileName) {
		file = new RandomAccessFile(fileName + ".meta", "r")
		count = file.length() / MetaIndex.LENGTH
	}

	def search(key) {
		assert (key.length() <= MetaIndex.KEY_CHAR_LENGTH)
		int start = searchStart(key, 0, count)
		int end = searchEnd(key, start, count)
		if (start > end) return loadIndex(0, 0)
		return loadIndex(start, (end-start+1))
	}

	def getIndex(int i) {
		int len = MetaIndex.LENGTH
		def buf = new byte[len]
		file.seek(i * MetaIndex.LENGTH)
		file.read(buf)
		return new MetaIndex(buf)
	}
	
	def loadIndex(indexOffset = 0, indexCount = -1) {
		if (indexOffset < 0) indexOffset = 0
		if (indexCount == -1 || file.length() < (indexCount * MetaIndex.LENGTH)) indexCount = count
		int len = (indexCount * MetaIndex.LENGTH)
		byte[] buf = new byte[len]
		file.seek(indexOffset * MetaIndex.LENGTH)
		file.read(buf)

		int i=0
		def indexes = [];
		while (i<indexCount) {
			indexes << new MetaIndex(buf, i * MetaIndex.LENGTH)
			i ++
		}
		return indexes
	}
	def searchStart(key, int start, int end) {
		if (start == end || end < 0) return start
		int i = (int)Math.floor((start+end)/2)
		def idx = getIndex(i)
		def c = idx.compareTo(key)
//		println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"
		if (c >= 0) {
			return searchStart(key, start, i)
		} else {
			if ((end-start)==1) i = end
			return searchStart(key, i, end)
		}
	}
	def searchEnd(key, start, end) {
		if (start == end || end < 0) return end
		def i = (int)Math.ceil((start+end)/2)
		if (i>end) i = end
		def idx = getIndex(i)
		def c = idx.compareTo(key)
//		println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"
		if (c <= 0) {
			return searchEnd(key, i, end)
		} else {
			if ((end-start)==1) i = start
			return searchEnd(key, start, i)
		}
	}
}

class Timer {
	def name
	def time = new Date()
	def Timer(n) {name = n}
	def stop() {
		println "TIME[${name}]:${(new Date().getTime() - time.getTime())}" 
	}
}