辞書アプリの改良:複数辞書対応+α
関連エントリ
StarDictの辞書アプリ - Random Note
辞書アプリの改良 - Random Note
辞書アプリの改良:初期化処理の高速化(1) - Random Note
辞書アプリの改良:初期化処理の高速化(2) - Random Note
辞書アプリの改良:コマンド化 - Random Note
辞書アプリの改良:部分一致(1) - Random Note
辞書アプリの改良:部分一致(2) - Random Note
辞書アプリの改良:部分一致(3) - Random Note
辞書アプリの改良:部分一致(4) - Random Note
辞書アプリの改良:部分一致(5) - Random Note
改良点
- 複数辞書対応(同階層にある*.metaを全てメタインデックスファイルと見なす)
- ゲンゾウさん添削の取り込み
- 1インデックス内に同一連続文字があると検索結果が重複するバグを修正
- インデックスファイル、メタインデックスファイルのopen/closeのスコープを縮小(パフォーマンスにほぼ差異なし)
ソース
// dict.groovy — searches every StarDict-style dictionary in the current directory.
// Any *.meta file here is treated as a meta-index; the matching .idx/.dict files
// are expected alongside it (same base name).
def dictionaries = []
new File(".").eachFileMatch(~".*.meta") {
    // Strip the trailing ".meta" (5 chars) to get the dictionary base name.
    dictionaries << new Dictionary(it.name.substring(0, it.name.length()-5))
}
key = args[0]
println "search: ${key}"
def timer = new Timer("All")
// word -> list of [d: dictionary, i: index-entry] hits, merged across dictionaries.
def resultMap = [:]
dictionaries.each {dictionary ->
    def resultList = dictionary.search(key)
    resultList.each {idx ->
        // Groovy Map.get(key, default) inserts the default when absent,
        // so this both creates and appends to the per-word bucket.
        resultMap.get(idx.word, []) << [d:dictionary, i:idx]
    }
}
// Print words in sorted order; definitions are only printed for small result
// sets (<= 100 total words) to keep output readable.
resultMap.keySet().sort({l,r -> l.compareTo(r)}).each {word ->
    println word
    if (resultMap.size() <= 100) {
        resultMap.get(word).each {map ->
            println "=> " + map.d.getDefinition(map.i)
        }
    }
}
println "count:${resultMap.size()}"
timer.stop()

// One dictionary = <name>.meta (meta index) + <name>.idx (index) + <name>.dict (bodies).
// MetaIndex/Index are defined elsewhere in the project (not visible in this file).
class Dictionary extends BaseDictionary {
    def meta       // MetaDictionary over <name>.meta
    def file       // <name>.idx — index entries (word, offset, length)
    def dictFile   // <name>.dict — definition bodies

    def Dictionary(String fileName) {
        meta = new MetaDictionary(fileName)
        file = new File(fileName + ".idx")
        dictFile = new File(fileName + ".dict")
    }

    // Substring search: cut the key into overlapping fragments of
    // MetaIndex.KEY_CHAR_LENGTH chars, look each fragment up in the meta index,
    // intersect the hits by .idx offset, then verify fragment positions line up.
    def search(key) {
        def timer = new Timer("meta search")
        def keyFragments = cut(key)
        def metaIndexesList = []
        for (fragment in keyFragments) {
            def t = new Timer("meta search each")
            metaIndexesList << meta.search(fragment.word)
            t.stop()
        }
        timer.stop()
        // NOTE(review): cut() always returns at least one fragment, so this
        // outer list is never empty — this guard looks dead. Checking whether
        // ANY fragment's result list is empty may be what was intended; verify.
        if (metaIndexesList.empty) return []
        timer = new Timer("weak intersection")
        // Sort so the smaller hit lists are processed first to reduce work.
        // NOTE(review): the comment above states ascending by size, but
        // r.size() <=> l.size() sorts DESCENDING — comparator may be inverted.
        // The final intersection content should be unaffected; only cost is.
        metaIndexesList.sort {l, r -> return r.size() <=> l.size()}
        def andList = metaIndexesList.get(0).clone()
        // Repeatedly intersect andList with each remaining fragment's hits,
        // keyed by .idx offset. Entries from BOTH sides are kept for the
        // position check below; mapB dedupes by offset (the duplicate-result
        // bug fix mentioned in the notes above).
        (1..<metaIndexesList.size()).each{i->
            def mapA = [:]
            def mapB = [:]
            metaIndexesList.get(i).each {metaIndex -> mapA.get(metaIndex.offset, []) << metaIndex}
            andList.each {metaIndex ->
                if (mapA.containsKey(metaIndex.offset)) mapB.get(metaIndex.offset, []) << metaIndex
            }
            def tmpList = []
            mapB.each {mapKey, value ->
                tmpList.addAll(mapA.get(mapKey))
                tmpList.addAll(value)
            }
            andList = tmpList
        }
        timer.stop()
        // Check that the fragments occur at consistent relative positions
        // within the candidate word (so the hits really form the search key).
        timer = new Timer("filter")
        def map = [:]
        andList.each { map.get(it.offset, []) << it }
        def resultMap = [:]
        map.keySet().each {offsetKey ->
            def metaList = map.get(offsetKey)
            // Anchor on the first fragment (the first KEY_CHAR_LENGTH chars of the key).
            def baseList = metaList.findAll {x -> x.word == keyFragments[0].word}
            baseList.each {base ->
                def flag = true
                for (fragment in keyFragments) {
                    // Every fragment must appear at the expected offset
                    // relative to the anchor occurrence.
                    flag = flag && metaList.any { x ->
                        fragment.word == x.word && (x.wordOffset - base.wordOffset) == fragment.wordOffset
                    }
                }
                // get(key, default) inserts — used purely for its side effect
                // of recording the match (one entry per .idx offset).
                if (flag) resultMap.get(base.offset, base)
            }
        }
        timer.stop()
        def resultList = resultMap.collect {k, v -> getIndex(v.offset, v.length)}
        resultList.sort {l, r-> l.word <=> r.word}
        return resultList
    }

    // NOTE(review): not referenced anywhere in this file — superseded by the
    // map.get(key, []) << idx idiom above. Candidate for removal.
    def append(Map map, MetaIndex idx) {
        if (map.containsKey(idx.offset)) {
            map.get(idx.offset) << idx
        } else {
            map.put(idx.offset, [idx])
        }
    }

    // Split the key into every KEY_CHAR_LENGTH-character window, recording each
    // window's start position (wordOffset). Keys shorter than the window size
    // become a single fragment of the whole string.
    def cut(string) {
        if (string.length() < MetaIndex.KEY_CHAR_LENGTH) return [new MetaIndex(string, 0, 0, 0)]
        def list = []
        int i=0
        while (i <string.length()-MetaIndex.KEY_CHAR_LENGTH +1) {
            list << new MetaIndex(string.substring(i, i+MetaIndex.KEY_CHAR_LENGTH), 0, 0, i)
            i++
        }
        return list
    }

    // Read one raw index record from the .idx file.
    def getIndex(int offset, int length) {
        return new Index(read(file, offset, length), 0, length)
    }

    // Read the definition body for an index entry from the .dict file.
    def getDefinition(index) {
        return new String(read(dictFile, index.offset, index.length), "UTF-8")
    }
}

// Binary search over a sorted, fixed-record-length .meta file.
class MetaDictionary extends BaseDictionary {
    def file = null
    int count = 0   // number of fixed-size MetaIndex records in the file

    def MetaDictionary(fileName) {
        file = new File(fileName + ".meta")
        count = file.length() / MetaIndex.LENGTH
    }

    // Return all records whose key matches; [start, end] is the matching run
    // found by two binary searches. start > end means no match.
    def search(key) {
        assert (key.length() <= MetaIndex.KEY_CHAR_LENGTH)
        int start = searchStart(key, 0, count)
        int end = searchEnd(key, start, count)
        if (start > end) return loadIndex(0, 0)
        return loadIndex(start, (end-start+1))
    }

    def getIndex(int i) {
        return new MetaIndex(read(file, i*MetaIndex.LENGTH, MetaIndex.LENGTH))
    }

    // Bulk-read indexCount records starting at record indexOffset.
    // indexCount == -1 (or an over-long request) means "all records".
    def loadIndex(indexOffset = 0, indexCount = -1) {
        if (indexOffset < 0) indexOffset = 0
        if (indexCount == -1 || file.length() < (indexCount * MetaIndex.LENGTH)) indexCount = count
        byte[] buf = read(file, indexOffset * MetaIndex.LENGTH, indexCount * MetaIndex.LENGTH)
        def indexes = (0..<indexCount).collect { new MetaIndex(buf, it * MetaIndex.LENGTH) }
        return indexes
    }

    // Binary search (recursive) for the FIRST record >= key. Uses floor midpoint
    // and a special step when the range narrows to one element.
    // NOTE(review): relies on MetaIndex.compareTo(key); assumed consistent with
    // the file's sort order — confirm against the MetaIndex class.
    def searchStart(key, int start, int end) {
        if (start == end || end < 0) return start
        int i = (int)Math.floor((start+end)/2)
        def idx = getIndex(i)
        def c = idx.compareTo(key)
        // println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"
        if (c >= 0) {
            return searchStart(key, start, i)
        } else {
            if ((end-start)==1) i = end
            return searchStart(key, i, end)
        }
    }

    // Binary search (recursive) for the LAST record <= key — mirror image of
    // searchStart, using ceil midpoint so the range always shrinks.
    def searchEnd(key, start, end) {
        if (start == end || end < 0) return end
        def i = (int)Math.ceil((start+end)/2)
        if (i>end) i = end
        def idx = getIndex(i)
        def c = idx.compareTo(key)
        // println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"
        if (c <= 0) {
            return searchEnd(key, i, end)
        } else {
            if ((end-start)==1) i = start
            return searchEnd(key, start, i)
        }
    }
}

// Shared random-access file reader. Opened/closed per call to keep the open
// scope small (per the notes above: no measurable performance difference).
class BaseDictionary {
    byte[] read(File file, long offset, long length) {
        byte[] buf = new byte[length]
        def raf = new RandomAccessFile(file, "r")
        raf.seek(offset)
        // NOTE(review): read(buf) may return fewer bytes than requested and
        // raf is not closed on exception (no try/finally) — consider
        // readFully() inside try/finally.
        raf.read(buf)
        raf.close()
        return buf
    }
}

// Wall-clock timer; prints elapsed milliseconds on stop().
class Timer {
    def name
    def time = new Date()
    def Timer(n) {name = n}
    def stop() {
        println "TIME[${name}]:${(new Date().getTime() - time.getTime())}"
    }
}
実行結果
2つの辞書にあるキーワードがなかなか見つからん。
やっぱりこのタイ語辞書は変な言葉ばかりだ
% ./dict.sh 辞書 search: 辞書 TIME[meta search each]:225 TIME[meta search]:259 TIME[weak intersection]:49 TIME[filter]:130 TIME[meta search each]:44 TIME[meta search]:44 TIME[weak intersection]:1 TIME[filter]:5 この辞書は、ソムシィー先生によって編纂されました。 => พจนานุกรมเล่มนี้แต่งโดยอาจารย์สมศรี どんな辞書を使っているの? => ใช้พจนานุกรมแบบไหน はい。(男性の丁寧語・若い人は、ほとんどR音を発音しません、Rを発音しない方が多数派です。本辞書は一応、khrapと正式な発音記号をいれています) => ครับ アルファベットの辞書 => อักขรานุกรม ウエブスター(1758〜1843米国の辞書編纂家) => เว็บสเตอร์ 人名辞書 => [じんめいじしょ] /biographical dictionary/ 何冊(本・書籍・辞書・雑誌・ノート) => เล่ม 別の辞書に当たって見る => [べつのじしょにあたってみる] /(exp) to try another dictionary/ 和英辞書 => [わえいじしょ] /(n) Japanese-English dictionary/ 日葡辞書 => [にっぽじしょ] /Japanese-Portuguese dictionary/ 英々辞書 => [えいえいじしょ] /(n) English-English dictionary/ 英英辞書 => [えいえいじしょ] /(n) English-English dictionary/ 辞書 => [じしょ] /(n) dictionary/lexicon/(P)/ => ดิค, พจนานุกรม, ศัพท์แสง 辞書に拠れば => [じしょによれば] /(exp) based on (according to) the dictionary/ 辞書を出す => [じしょをだす] /(exp) to publish a dictionary/ 辞書を引く => เปิดดิค 辞書を引くのは面倒だ。 => การจะเปิดพจนานุกรมเป็นเรื่องยุ่งยาก 辞書を繰る => [じしょをくる] /(exp) to consult a dictionary/ 辞書部門 => [じしょぶもん] /(n) lexicon/ 辞書類 => [じしょるい] /(n) dictionaries (and similar books)/ 電子辞書 => [でんしじしょ] /electronic dictionary/ => พจนานุกรมอิเล็กทรอนิกส์ count:21 TIME[All]:770
メタインデックス作成ツールも対象ファイルを指定できるよう修正。
Indexer.groovy
// Indexer.groovy — builds <name>.meta (the sorted meta index) from <name>.idx.
// External-sort strategy: meta entries are bucketed into per-first-character
// temp files under work/, each bucket is sorted in memory, then buckets are
// concatenated in order into the final .meta file.
fileName = args[0]
workDir = new File("work")
deleteTmpFile()
workDir.mkdirs()
def indexes = loadCompleteIndex()
List metaList = []
int i=0
for (idx in indexes) {
    metaList.addAll(idx.toMetaIndexList())
    i++
    // Flush to temp files every 1000 index entries to bound memory use.
    if (i%1000 == 0) {
        makeTmpFile(metaList)
        metaList = []
        println ("parsing:" + (i*100/indexes.size()) + "%")
    }
}
makeTmpFile(metaList)
makeMetaFile()

// Merge the per-character temp files (in key order) into <fileName>.meta.
// Each temp file is small enough to sort fully in memory.
def makeMetaFile() {
    File outFile = new File(fileName + ".meta")
    outFile.delete()
    outFile.createNewFile()
    def os = new BufferedOutputStream(new FileOutputStream(outFile))
    // NOTE(review): os is not closed if an exception occurs mid-loop (no
    // try/finally) — a partial .meta could be left behind.
    listTmpFiles().each {f ->
        println ("writing: " + hex2str(f.name) + ":" + f.length() + ":" + (f.length() / MetaIndex.LENGTH))
        def buf = f.readBytes()
        def list = []
        def i = 0
        while ((i*MetaIndex.LENGTH) < buf.length) {
            list << new MetaIndex(buf, i*MetaIndex.LENGTH)
            i++
        }
        list.sort()
        i = 0
        for (m in list) {
            os.write(m.toBytes())
            i++
        }
    }
    os.close()
}

// Temp files are named by the hex of their first character's UTF-8 bytes;
// sorting by the decoded string yields the final key order.
def listTmpFiles() {
    def list = workDir.listFiles().toList()
    return list.sort {l,r -> return hex2str(l.name).compareTo(hex2str(r.name)) }
}

// Append each meta entry to the temp file for its word's first character.
// metaList is sorted first so consecutive entries mostly share a bucket,
// minimizing file open/close churn.
def makeTmpFile(metaList) {
    metaList.sort()
    // NOTE(review): the initial bucket is hard-coded to hex("a") ("61"), so an
    // empty work/61 file is created even when no word starts with "a" —
    // harmless (empty files contribute nothing in makeMetaFile) but untidy.
    def prev = str2Hex("a")
    def tmpFile = new BufferedOutputStream(new FileOutputStream(new File(workDir, prev), true))
    for (m in metaList) {
        def first = str2Hex(m.word.substring(0,1))
        if (prev != first) {
            prev = first
            tmpFile.close()
            tmpFile = new BufferedOutputStream(new FileOutputStream(new File(workDir, prev), true))
        }
        tmpFile.write(m.toBytes())
    }
    tmpFile.close()
}

// Encode a string's bytes as a lowercase hex string (used as a safe filename).
// NOTE(review): uses the platform default charset via getBytes() while
// hex2str decodes as UTF-8 — consistent only when the platform charset is
// UTF-8; consider getBytes("UTF-8") for symmetry. Confirm before changing.
def str2Hex(String val) {
    byte[] bytes = val.getBytes()
    def result = ""
    bytes.each { result += Integer.toHexString(it & 0xff) }
    return result
}

// Decode a hex filename (two hex digits per byte) back to a UTF-8 string.
def hex2str(val) {
    byte[] buf = new byte[(val.length() / 2)]
    int i=0
    while (val.length() > 0) {
        buf[i] = Byte.parseByte(val.substring(0, 1), 16) * 16 + Byte.parseByte(val.substring(1, 2), 16)
        val = val.substring(2)
        i++
    }
    return new String(buf, "UTF-8")
}

// Remove the work directory and its contents (Groovy's each on the possible
// null from listFiles() iterates zero times, so a missing dir is a no-op).
def deleteTmpFile() {
    workDir.listFiles().each { it.delete() }
    workDir.delete()
}

// Parse <fileName>.idx into Index objects. Entries are delimited by a NUL
// terminator on the word; the 9-byte skip past each NUL presumably covers the
// 4-byte offset and 4-byte size fields of the StarDict .idx layout plus the
// NUL itself — TODO confirm against the Index class.
def loadCompleteIndex(indexOffset = 0, indexLength = -1) {
    if (indexOffset < 0) indexOffset = 0
    RandomAccessFile randomFile = new RandomAccessFile(fileName + ".idx", "r")
    if (indexLength == -1) indexLength = randomFile.length()
    def buf = new byte[indexLength]
    randomFile.seek(indexOffset)
    // NOTE(review): read(buf) may short-read and there is no try/finally
    // around close — consider readFully() in a finally block.
    randomFile.read(buf)
    randomFile.close()
    def i=0
    int offset = 0
    int count = 0
    def indexes = new ArrayList();
    while (i<buf.length) {
        if (buf[i] == 0) {
            // Entry spans [offset, i] (word + NUL) plus 8 trailing bytes.
            def idx = new Index(buf, offset, i-offset+9)
            if (idx.word.length() > 0) indexes.add(idx)
            offset = i+9
            i = i+8   // jump past the 8 fixed bytes; the i++ below skips the NUL
            count++
        }
        i++
        if (i%200000 == 0) println ("loading:" + (i*100/(indexLength)) + "%")
    }
    return indexes
}