辞書アプリの改良:初期化処理の高速化(2)
ソース
メタインデックスを利用して、検索ごとにファイルからインデックスを読み込むようにソースを修正。
インデックスは一切事前にメモリに保持をしていない。
startCount() def dictionary = new Dictionary("test") println "init:" + stopCount() def input = new BufferedReader(new InputStreamReader(System.in)) while (true) { print "search: " key = input.readLine() startCount() def resultList = dictionary.search(key) for (idx in resultList) { println idx.word if (resultList.size() <= 10) { println "=> " + dictionary.getDefinition(idx) } } print "time:" + stopCount() println ", count:${resultList.size()}" } def time = new Date() def startCount() { time = new Date() } def stopCount() { return new Date().getTime() - time.getTime() } class Dictionary extends BaseDictionary { def meta def indexes def Dictionary(String fileName) { meta = new MetaDictionary(fileName) indexes = [] } def search(key) { indexes = meta.search(key) def start = searchStart(key, 0, indexes.size()-1) def end = searchEnd(key, 0, indexes.size()-1) if (indexes.size() > 0 && indexes.get(0).word == key) return [indexes.get(0)] return indexes.subList(start, end+1) } def getIndex(int i) { return indexes.get(i) } def getDefinition(index) { def defIn = new FileInputStream("test.dict") def buf = new byte[index.length] RandomAccessFile randamFile = new RandomAccessFile("test.dict", "r") randamFile.seek(index.offset) randamFile.read(buf) randamFile.close() return new String(buf, "UTF-8") } } class MetaDictionary extends BaseDictionary { static final int INDEX_LENGTH = 20 static final int INDEX_KEY_LENGTH = 12 static final int INDEX_KEY_CHAR_LENGTH = 2 def file = null def indexFile = null int count = 0 def MetaDictionary(fileName) { file = new RandomAccessFile(fileName + ".meta", "r") indexFile = new File("test.idx") count = file.length() / INDEX_LENGTH } def search(originalKey) { def key = originalKey if (originalKey.length() > INDEX_KEY_CHAR_LENGTH) key = originalKey.substring(0, INDEX_KEY_CHAR_LENGTH) int start = searchStart(key, 0, count-1) int end = searchEnd(key, 0, count-1) if (start > end) return loadCompleteIndex(0, 0) def startIndex = getIndex(start) def endIndex = getIndex(end); return loadCompleteIndex(startIndex.offset, endIndex.offset - startIndex.offset + endIndex.length) } def getIndex(int i) { def buf = new byte[INDEX_LENGTH] file.seek(i * INDEX_LENGTH) file.read(buf) return new MetaIndex(buf) } def loadCompleteIndex(indexOffset = 0, indexLength = -1) { if (indexOffset < 0) indexOffset = 0 if (indexLength == -1) indexLength = indexFile.length() def buf = new byte[indexLength] RandomAccessFile randamFile = new RandomAccessFile(indexFile, "r") randamFile.seek(indexOffset) randamFile.read(buf) randamFile.close() def i=0 int offset = 0 int count = 0 def indexes = new ArrayList(); while (i<buf.length) { if (buf[i] == 0) { def idx = new Index(buf, offset, i-offset+9) if (idx.word.length() > 0) indexes.add(idx) offset = i+9 i = i+8 count++ /* if (count > 1000) break*/ } i++ } return indexes } } class BaseDictionary { /* abstract BaseIndex getIndex(int i);*/ def searchStart(key, int start, int end) { if (start == end || end < 0) return start int i = (int)Math.floor((start+end)/2) def idx = getIndex(i) def c = idx.startsWith(key) /* println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"*/ if (c >= 0) { return searchStart(key, start, i) } else { if ((end-start)==1) i = end return searchStart(key, i, end) } } def searchEnd(key, start, end) { if (start == end || end < 0) return end def i = (int)Math.ceil((start+end)/2) if (i>end) i = end def idx = getIndex(i) def c = idx.startsWith(key) /* println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"*/ if (c <= 0) { return searchEnd(key, i, end) } else { if ((end-start)==1) i = start return searchEnd(key, start, i) } } } class BaseIndex { def word def offset def length def wordBuf def cut(bytes, o, l) { def i=0 def buf = new byte[l] while (i < l) { buf[i] = bytes[i+o] i++ } return buf } def parseInt(byte[] bytes) { def result = 0 def i = 0 for (b in bytes) { result += (b & 0xff) * (int)(Math.pow(256, bytes.length-i-1)) i++ } return result } int startsWith(String string) { int i=0 while (i<word.length() && i<string.length()) { if (word.charAt(i) != string.charAt(i)) return word.charAt(i) - string.charAt(i) i++ } return word.length() < string.length() ? -1 : 0 } String toString() { return "{word: ${word},\tbuf: {$wordBuf},\toffset:${offset},\tlength:${length}}" } } class MetaIndex extends BaseIndex { public static final int INDEX_LENGTH = 20 public static final int INDEX_KEY_LENGTH = 12 public static final int INDEX_KEY_CHAR_LENGTH = 2 def MetaIndex(buf) { wordBuf = cut(buf, 0, INDEX_KEY_LENGTH) word = new String(wordBuf, "UTF-8").trim() // 余計なバイト(0)をtrimで削除 wordBuf = word.getBytes("UTF-8") offset = parseInt(cut(buf, INDEX_KEY_LENGTH, 4)) length = parseInt(cut(buf, INDEX_KEY_LENGTH+4, 4)) } } class Index extends BaseIndex { def Index(byte[] buf, int o, int l) { wordBuf = cut(buf, o, l-9) word = new String(wordBuf, "UTF-8").trim() offset = parseInt(cut(buf, o+l-8, 4)) length = parseInt(cut(buf, o+l-4, 4)) } }
パフォーマンス
最初にメモリに読み込むパターンと、メタインデックス利用パターンで実行時間を計測してみた。
当然、メモリ読み込みパターンの方が圧倒的に早いが、メタインデックス利用パターンも100件程度の検索なら0.1秒程度で返ってきており、十分使い物になりそう。
メモリ読み込みパターンの初期化処理の遅さと、必要メモリの大きさを考えると、総合的にはメタインデックス利用パターンの方がよさそうだ。
「機」の検索では検索速度が逆転しているが、これはファイル読み込みにRandomAccessFileを使うようにしたため。RandomAccessFileを利用すれば、インデックスの出現位置は検索速度に影響を与えていないことがわかる。
メモリ読み込みパターン
init: | time:20127 |
search: あ | time:1026, count:2487 |
search: あい | time:50, count:1 |
search: あいし | time:8, count:14 |
search: あいして | time:10, count:5 |
search: 機 | time:130, count:84 |
メタインデックス利用パターン
init: | 187 |
search: あ | time:2547, count:2487 |
search: あい | time:247, count:1 |
search: あいし | time:143, count:14 |
search: あいして | time:132, count:5 |
search: 機 | time:104, count:84 |