Improving the Dictionary App: Faster Initialization (2)

Source

The source was modified to use the meta-index and read the index from the file on every search.
No index data is held in memory up front.
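
For reference, the on-disk layout the code below assumes is roughly the following. This is inferred from the MetaIndex and Index parsing code rather than documented anywhere, so treat the exact byte layout as an assumption:

test.meta : fixed 20-byte records, each a 12-byte zero-padded UTF-8 key plus a 4-byte big-endian offset and a 4-byte length pointing into test.idx
test.idx  : variable-length records, each a UTF-8 word, a 0x00 terminator, then a 4-byte offset and a 4-byte length pointing into test.dict
test.dict : the definition bodies, read using the (offset, length) pairs taken from test.idx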

startCount()
def dictionary = new Dictionary("test")
println "init:" + stopCount()

def input = new BufferedReader(new InputStreamReader(System.in))
while (true) {
	print "search: "
	key = input.readLine()
	
	startCount()
	def resultList = dictionary.search(key)
	for (idx in resultList) {
		println idx.word
		if (resultList.size() <= 10) {
			println "=> " + dictionary.getDefinition(idx)
		}
	}
	print "time:" + stopCount()
	println ", count:${resultList.size()}" 
}

// `time` lives in the script binding (assigned without `def` in startCount) so both helpers can see it
def startCount() {
	time = new Date()
}
def stopCount() {
	return new Date().getTime() - time.getTime()
}

class Dictionary extends BaseDictionary {
	def meta
	def indexes
	def dictFileName
	Dictionary(String fileName) {
		meta = new MetaDictionary(fileName)
		dictFileName = fileName + ".dict"
		indexes = []
	}
	
	def search(key) {
		// narrow the candidates down via the meta-index, then binary-search the loaded slice
		indexes = meta.search(key)
		def start = searchStart(key, 0, indexes.size()-1)
		def end = searchEnd(key, 0, indexes.size()-1)
		
		// exact match on the first loaded entry: return just that entry
		if (indexes.size() > 0 && indexes.get(0).word == key) return [indexes.get(0)]

		return indexes.subList(start, end+1)
	}
	
	def getIndex(int i) {
		return indexes.get(i)
	}
	
	def getDefinition(index) {
		// read the definition body straight out of the .dict file via seek()
		def buf = new byte[index.length]
		RandomAccessFile randomFile = new RandomAccessFile(dictFileName, "r")
		randomFile.seek(index.offset)
		randomFile.readFully(buf)
		randomFile.close()
		return new String(buf, "UTF-8")
	}
}

class MetaDictionary extends BaseDictionary {
	static final int INDEX_LENGTH = 20
	static final int INDEX_KEY_LENGTH = 12
	static final int INDEX_KEY_CHAR_LENGTH = 2
	def file = null
	def indexFile = null
	int count = 0
	MetaDictionary(fileName) {
		file = new RandomAccessFile(fileName + ".meta", "r")
		indexFile = new File(fileName + ".idx")
		count = (int) (file.length() / INDEX_LENGTH)	// number of fixed-size meta records
	}

	def search(originalKey) {
		// meta-index keys are at most INDEX_KEY_CHAR_LENGTH characters, so truncate the search key
		def key = originalKey
		if (originalKey.length() > INDEX_KEY_CHAR_LENGTH) key = originalKey.substring(0, INDEX_KEY_CHAR_LENGTH)
		int start = searchStart(key, 0, count-1)
		int end = searchEnd(key, 0, count-1)
		if (start > end) return loadCompleteIndex(0, 0)	// no matching block: empty result
		def startIndex = getIndex(start)
		def endIndex = getIndex(end)
		// load only the slice of the full index covered by the matching meta records
		return loadCompleteIndex(startIndex.offset, endIndex.offset - startIndex.offset + endIndex.length)
	}
	
	def getIndex(int i) {
		// read the i-th fixed-size record; with seek() the cost does not depend on i
		def buf = new byte[INDEX_LENGTH]
		file.seek(i * INDEX_LENGTH)
		file.readFully(buf)
		return new MetaIndex(buf)
	}

	def loadCompleteIndex(indexOffset = 0, indexLength = -1) {
		if (indexOffset < 0) indexOffset = 0
		if (indexLength == -1) indexLength = indexFile.length()
		def buf = new byte[indexLength]
		RandomAccessFile randomFile = new RandomAccessFile(indexFile, "r")
		randomFile.seek(indexOffset)
		randomFile.readFully(buf)
		randomFile.close()

		// each record is: UTF-8 word, a 0x00 terminator, a 4-byte offset and a 4-byte length
		def i = 0
		int offset = 0
		int count = 0
		def indexes = []
		while (i < buf.length) {
			if (buf[i] == 0) {
				def idx = new Index(buf, offset, i-offset+9)
				if (idx.word.length() > 0) indexes.add(idx)
				offset = i+9	// skip the terminator and the 8 bytes of offset/length
				i = i+8
				count++
/*				if (count > 1000) break*/
			}
			i++
		}
		return indexes
	}
}

class BaseDictionary {
	// subclasses are expected to implement: BaseIndex getIndex(int i)
	
	// binary search for the leftmost entry whose word starts with key (or sorts after it)
	def searchStart(key, int start, int end) {
		if (start == end || end < 0) return start
		int i = (int)Math.floor((start+end)/2)
		def idx = getIndex(i)
		def c = idx.startsWith(key)
/*		println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"*/
		if (c >= 0) {
			return searchStart(key, start, i)
		} else {
			if ((end-start)==1) i = end
			return searchStart(key, i, end)
		}
	}
	// binary search for the rightmost entry whose word starts with key (or sorts before it)
	def searchEnd(key, start, end) {
		if (start == end || end < 0) return end
		def i = (int)Math.ceil((start+end)/2)
		if (i>end) i = end
		def idx = getIndex(i)
		def c = idx.startsWith(key)
/*		println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"*/
		if (c <= 0) {
			return searchEnd(key, i, end)
		} else {
			if ((end-start)==1) i = start
			return searchEnd(key, start, i)
		}
	}
}

class BaseIndex {
	def word
	def offset
	def length
	def wordBuf

	// copy l bytes from bytes starting at offset o
	def cut(bytes, o, l) {
		def i=0
		def buf = new byte[l]
		while (i < l) {
			buf[i] = bytes[i+o]
			i++
		}
		return buf
	}

	// interpret the bytes as a big-endian unsigned integer
	def parseInt(byte[] bytes) {
		def result = 0
		def i = 0
		for (b in bytes) {
			result += (b & 0xff) * (int)(Math.pow(256, bytes.length-i-1))
			i++
		}
		return result
	}
	// prefix comparison: 0 if word starts with string, negative if word sorts before it, positive if after
	int startsWith(String string) {
		int i=0
		while (i<word.length() && i<string.length()) {
			if (word.charAt(i) != string.charAt(i)) return word.charAt(i) - string.charAt(i)
			i++
		}
		return word.length() < string.length() ? -1 : 0
	}
	String toString() {
		return "{word: ${word},\tbuf: {$wordBuf},\toffset:${offset},\tlength:${length}}"
	}
}
class MetaIndex extends BaseIndex {
	public static final int INDEX_LENGTH = 20
	public static final int INDEX_KEY_LENGTH = 12
	public static final int INDEX_KEY_CHAR_LENGTH = 2
	MetaIndex(buf) {
		wordBuf = cut(buf, 0, INDEX_KEY_LENGTH)
		word = new String(wordBuf, "UTF-8").trim()	// trim() strips the zero-padding (0x00) bytes
		wordBuf = word.getBytes("UTF-8")
		offset = parseInt(cut(buf, INDEX_KEY_LENGTH, 4))
		length = parseInt(cut(buf, INDEX_KEY_LENGTH+4, 4))
	}
}
class Index extends BaseIndex {
	// buf[o..o+l-1] holds: UTF-8 word, a 0x00 terminator, a 4-byte offset and a 4-byte length
	Index(byte[] buf, int o, int l) {
		wordBuf = cut(buf, o, l-9)
		word = new String(wordBuf, "UTF-8").trim()
		offset = parseInt(cut(buf, o+l-8, 4))
		length = parseInt(cut(buf, o+l-4, 4))
	}
}

Performance

I measured execution times for both approaches: loading the whole index into memory up front, and going through the meta-index.
As expected, searching is far faster with the in-memory pattern, but the meta-index pattern still answers a query with around 100 hits in roughly 0.1 seconds, which should be perfectly usable.
Given how slow the in-memory pattern's initialization is and how much memory it needs, the meta-index pattern looks better overall.
For the search on 機 the ranking is reversed; this is because file reads now go through RandomAccessFile, and with seek-based access the position of an entry within the index file no longer affects search speed (a minimal sketch of this follows the measurements below). Times are in milliseconds.

In-memory pattern
init: time:20127
search: あ time:1026, count:2487
search: あい time:50, count:1
search: あいし time:8, count:14
search: あいして time:10, count:5
search: 機 time:130, count:84
Meta-index pattern
init: 187
search: あ time:2547, count:2487
search: あい time:247, count:1
search: あいし time:143, count:14
search: あいして time:132, count:5
search: 機 time:104, count:84
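
To make the RandomAccessFile point concrete, here is a minimal standalone sketch of the seek-based record read that MetaDictionary.getIndex performs. The file name and the 20-byte record size mirror the code above; readMetaRecord itself is just an illustrative helper, not part of the app:

// Fetch the i-th fixed-size record from the meta-index file.
// seek() jumps straight to byte i * recordLength, so reading the first record
// and reading the last one cost the same; no sequential scan is involved.
def readMetaRecord(int i, int recordLength = 20, String path = "test.meta") {
	def raf = new RandomAccessFile(path, "r")
	try {
		def buf = new byte[recordLength]
		raf.seek((long) i * recordLength)	// constant-time positioning
		raf.readFully(buf)	// read exactly one record
		return buf
	} finally {
		raf.close()
	}
}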