import xml.etree.cElementTree as ET

# Parse the XML file './JMdict_example.xml' into an ElementTree.
tree = ET.ElementTree(file='./JMdict_example.xml')
# Bare expression: in an interactive session this echoes the tree object.
tree

<ElementTree at 0x3d630d0>

# Root element of the parsed tree (echoed below in an interactive session).
root = tree.getroot()
root

<Element 'entry' at 0x03DC45D8>

# Iterating an Element yields its direct children; print each one.
for elem in root:
    print elem

<Element 'ent_seq' at 0x03DC4608>
<Element 'k_ele' at 0x03DC4488>
<Element 'r_ele' at 0x03DC4548>
<Element 'sense' at 0x03DC4770>
<Element 'sense' at 0x03DC4998>

# Text of the first 'reb' element found under an 'r_ele' child of the root.
print root.find('r_ele/reb').text

うよく

# Text of every 'pos' element found under the root's 'sense' children.
print [elem.text for elem in root.findall('sense/pos')]

['adj-no;', 'n;']

def extract_information(elem):
    """Return ``(reading, pos_tags)`` for one dictionary entry element.

    ``reading`` is the text of the first ``r_ele/reb`` descendant of
    ``elem``; ``pos_tags`` is a list holding the text of every
    ``sense/pos`` descendant, in document order.

    Raises AttributeError if ``elem`` has no ``r_ele/reb`` element,
    because ``find`` returns None in that case.
    """
    reading = elem.find('r_ele/reb').text
    pos_tags = []
    for pos in elem.findall('sense/pos'):
        pos_tags.append(pos.text)
    return (reading, pos_tags)

extract_information(root)

(u'\u3046\u3088\u304f', ['adj-no;', 'n;'])

# Rebind `tree` to the parse of '../JMdict.xml'; the code below reads its
# 'entry' elements.
tree = ET.ElementTree(file='../JMdict.xml')
tree

<ElementTree at 0x3d63bf0>

# Collect (reading, verb_type) pairs for every entry whose part-of-speech
# text mentions one of the verb types below.
verb_list = []
# Order matters: the first type found in an entry's part-of-speech text wins.
verb_types = [u'Ichidan', u'Nidan', u'Yodan', u'Godan']
for elem in tree.getroot().findall('entry'):
    item_info = extract_information(elem)
    # Join the tags once per entry instead of once per verb type.
    pos_text = " ".join(item_info[1])
    for verb_type in verb_types:
        if verb_type in pos_text:
            verb_list.append((item_info[0], verb_type))
            # One classification per entry: stop at the first match.
            break

len(verb_list)

10266

verb_list[:10]

[(u'\u3042\u3052\u3064\u3089\u3046', u'Godan'),
 (u'\u3042\u3057\u3089\u3046', u'Godan'),
 (u'\u3042\u3076\u308c\u308b', u'Ichidan'),
 (u'\u3042\u3084\u3059', u'Godan'),
 (u'\u3044\u304b\u3059', u'Godan'),
 (u'\u3044\u3058\u3051\u308b', u'Ichidan'),
 (u'\u3044\u3058\u308a\u307e\u308f\u3059', u'Godan'),
 (u'\u3044\u3061\u3083\u3064\u304f', u'Godan'),
 (u'\u3044\u306a\u306a\u304f', u'Godan'),
 (u'\u3044\u3073\u308b', u'Godan')]

for elem in map(": ".join, verb_list[:10]):
    print elem

あげつらう: Godan
あしらう: Godan
あぶれる: Ichidan
あやす: Godan
いかす: Godan
いじける: Ichidan
いじりまわす: Godan
いちゃつく: Godan
いななく: Godan
いびる: Godan

for verb_type in verb_types:
    print "%s: %i out of %i verbs" % (verb_type, len(filter(lambda v: v[1] == verb_type, verb_list)), len(verb_list))

Ichidan: 3415 out of 10266 verbs
Nidan: 38 out of 10266 verbs
Yodan: 25 out of 10266 verbs
Godan: 6788 out of 10266 verbs

for verb_type in verb_types:
    print "%s: %.1f percent" % (verb_type, len(filter(lambda v: v[1] == verb_type, verb_list)) / float(len(verb_list)) * 100)

Ichidan: 33.3 percent
Nidan: 0.4 percent
Yodan: 0.2 percent
Godan: 66.1 percent

# Tally verbs by the last character of their reading:
# last_char_dict[last character][verb type] -> number of verbs.
last_char_dict = {}
for reading, group in verb_list:
    per_group = last_char_dict.setdefault(reading[-1], {})
    per_group[group] = per_group.get(group, 0) + 1

last_char_dict

{u'\u3046': {u'Godan': 719, u'Nidan': 6, u'Yodan': 4},
 u'\u304f': {u'Godan': 911, u'Nidan': 2, u'Yodan': 4},
 u'\u3050': {u'Godan': 137, u'Nidan': 2},
 u'\u3059': {u'Godan': 1803, u'Nidan': 1, u'Yodan': 3},
 u'\u305a': {u'Ichidan': 1, u'Nidan': 2},
 u'\u3064': {u'Godan': 244, u'Nidan': 2, u'Yodan': 1},
 u'\u306c': {u'Godan': 9, u'Nidan': 2},
 u'\u3075': {u'Nidan': 1},
 u'\u3076': {u'Godan': 102, u'Nidan': 2, u'Yodan': 2},
 u'\u3080': {u'Godan': 596, u'Nidan': 4},
 u'\u3086': {u'Nidan': 7},
 u'\u308b': {u'Godan': 2267, u'Ichidan': 3414, u'Nidan': 7, u'Yodan': 11}}

# Format the counts of one key of last_char_dict (whichever key comes first)
# as "type: count, type: count".
", ".join(": ".join((group, str(count)))
          for group, count in last_char_dict[last_char_dict.keys()[0]].items())

u'Nidan: 4, Godan: 596'

for key in last_char_dict:
    print "verbs ending in %s" % key, ", ".join(map(lambda s: ": ".join((s[0], str(s[1]))), last_char_dict[key].items()))

verbs ending in む Nidan: 4, Godan: 596
verbs ending in つ Yodan: 1, Nidan: 2, Godan: 244
verbs ending in う Yodan: 4, Nidan: 6, Godan: 719
verbs ending in る Yodan: 11, Nidan: 7, Ichidan: 3414, Godan: 2267
verbs ending in ゆ Nidan: 7
verbs ending in ぬ Nidan: 2, Godan: 9
verbs ending in く Yodan: 4, Nidan: 2, Godan: 911
verbs ending in ぐ Nidan: 2, Godan: 137
verbs ending in ふ Nidan: 1
verbs ending in ぶ Yodan: 2, Nidan: 2, Godan: 102
verbs ending in す Yodan: 3, Nidan: 1, Godan: 1803
verbs ending in ず Nidan: 2, Ichidan: 1

# For verbs whose reading ends in 'る' and has at least two characters, tally
# by the character before it:
# second_last_char_dict[second-to-last character][verb type] -> number of verbs.
second_last_char_dict = {}
for reading, group in verb_list:
    if reading[-1] != u'る' or len(reading) < 2:
        continue
    per_group = second_last_char_dict.setdefault(reading[-2], {})
    per_group[group] = per_group.get(group, 0) + 1

for key in second_last_char_dict:
    print key, second_last_char_dict[key]

ゃ {u'Yodan': 2, u'Godan': 5}
わ {u'Godan': 130}
ェ {u'Godan': 1}
カ {u'Godan': 1}
ク {u'Godan': 5}
コ {u'Godan': 4}
シ {u'Godan': 3}
い {u'Ichidan': 32, u'Godan': 76}
え {u'Yodan': 1, u'Ichidan': 398, u'Godan': 41}
ニ {u'Godan': 3}
が {u'Nidan': 1, u'Godan': 162}
ぐ {u'Godan': 25}
ビ {u'Godan': 1}
ご {u'Nidan': 1, u'Godan': 3}
じ {u'Ichidan': 104, u'Godan': 21}
ぜ {u'Ichidan': 11}
ミ {u'Godan': 1}
だ {u'Nidan': 1, u'Godan': 18}
ャ {u'Godan': 1}
つ {u'Godan': 34}
ョ {u'Godan': 1}
と {u'Godan': 161}
ぬ {u'Godan': 6}
ば {u'Godan': 45}
へ {u'Ichidan': 3, u'Godan': 5}
ぼ {u'Godan': 32}
む {u'Godan': 13}
や {u'Godan': 28}
よ {u'Godan': 36}
れ {u'Ichidan': 548, u'Godan': 1}
グ {u'Godan': 5}
ジ {u'Godan': 2}
ツ {u'Godan': 1}
ト {u'Godan': 2}
か {u'Yodan': 2, u'Godan': 149}
く {u'Godan': 69}
バ {u'Godan': 2}
こ {u'Godan': 35}
ピ {u'Godan': 2}
し {u'Ichidan': 1, u'Godan': 37}
せ {u'Ichidan': 297, u'Godan': 9}
ボ {u'Godan': 2}
た {u'Godan': 72}
ム {u'Godan': 2}
で {u'Ichidan': 79, u'Godan': 1}
に {u'Ichidan': 3}
レ {u'Ichidan': 2}
は {u'Yodan': 1, u'Godan': 44}
び {u'Ichidan': 40, u'Godan': 7}
ほ {u'Godan': 6}
み {u'Ichidan': 61}
め {u'Ichidan': 470, u'Godan': 7}
ら {u'Ichidan': 1}
キ {u'Godan': 2}
ケ {u'Godan': 1}
ス {u'Godan': 4}
チ {u'Godan': 2}
あ {u'Yodan': 1, u'Godan': 28}
う {u'Godan': 13}
ド {u'Godan': 1}
お {u'Godan': 45}
ぎ {u'Ichidan': 29, u'Godan': 40}
パ {u'Godan': 1}
げ {u'Ichidan': 251, u'Godan': 5}
フ {u'Godan': 2}
ざ {u'Yodan': 2, u'Godan': 11}
ず {u'Ichidan': 79, u'Godan': 21}
ぞ {u'Godan': 3}
メ {u'Ichidan': 1}
て {u'Ichidan': 185, u'Godan': 3}
な {u'Nidan': 2, u'Godan': 147}
ロ {u'Godan': 3}
の {u'Godan': 31}
ひ {u'Ichidan': 3, u'Godan': 4}
ぶ {u'Nidan': 1, u'Godan': 70}
ま {u'Yodan': 2, u'Godan': 147}
も {u'Godan': 43}
ゆ {u'Godan': 1}
り {u'Ichidan': 24, u'Godan': 2}
ズ {u'Godan': 1}
テ {u'Godan': 2}
ナ {u'Godan': 1}
き {u'Ichidan': 30, u'Godan': 108}
け {u'Ichidan': 630, u'Godan': 22}
さ {u'Godan': 58}
ブ {u'Godan': 6}
す {u'Godan': 16}
そ {u'Nidan': 1, u'Godan': 10}
ち {u'Ichidan': 38, u'Godan': 9}
モ {u'Godan': 3}
づ {u'Godan': 2}
ど {u'Godan': 35}
ね {u'Ichidan': 61, u'Godan': 15}
ぱ {u'Godan': 5}
ふ {u'Godan': 18}
べ {u'Ichidan': 33, u'Godan': 9}

def vowel_type(character):
    """Return the vowel ('a', 'i', 'u', 'e' or 'o') that a kana is filed under.

    Only the characters listed in the table below are supported.

    character -- a single kana character (unicode string of length 1).

    Raises ValueError (a subclass of Exception) if ``character`` is not
    exactly one character or is not in the table. The offending value is
    included in the error message.
    """
    vowels = ['a', 'i', 'u', 'e', 'o']
    # One row per vowel, in the same order as `vowels`.
    chars = [u'あかさたなはまやらわゃカがだャばバパざナぱ',
             u'シいニビじミジピしにびみキチぎひりきち',
             u'クぐつぬむグツくムスうフずぶゆズブすづふ',
             u'べェえぜへれせでレめケげメてテけね',
             u'コごョとぼよトこボほドおぞロのもそモど']
    # `in` on a string is a substring test, so '' and multi-character strings
    # would otherwise match a row by accident; only accept single characters.
    if len(character) == 1:
        for vowel, row in zip(vowels, chars):
            if character in row:
                return vowel
    raise ValueError('character not supported: %r' % (character,))

# Merge the per-character counts into per-vowel counts:
# final_classification[vowel][verb type] -> number of verbs.
final_classification = {}
for key in second_last_char_dict:
    vowel = vowel_type(key)
    # Always accumulate into a fresh dict owned by final_classification.
    # Storing second_last_char_dict[key] itself would alias it, and the
    # additions below would then silently modify second_last_char_dict.
    vowel_totals = final_classification.setdefault(vowel, {})
    for verb_group, count in second_last_char_dict[key].items():
        vowel_totals[verb_group] = vowel_totals.get(verb_group, 0) + count

final_classification

{'a': {u'Godan': 1055, u'Ichidan': 1, u'Nidan': 4, u'Yodan': 10},
 'e': {u'Godan': 122, u'Ichidan': 2969, u'Yodan': 1},
 'i': {u'Godan': 320, u'Ichidan': 365},
 'o': {u'Godan': 456, u'Nidan': 2},
 'u': {u'Godan': 314, u'Ichidan': 79, u'Nidan': 1}}

for key in final_classification.keys():
    print 'vowel sound ending in %s' % key
    print ", ".join(map(lambda s: ": ".join((s[0], str(s[1]))), final_classification[key].items()))

vowel sound ending in a
Yodan: 10, Nidan: 4, Ichidan: 1, Godan: 1055
vowel sound ending in u
Nidan: 1, Ichidan: 79, Godan: 314
vowel sound ending in e
Yodan: 1, Ichidan: 2969, Godan: 122
vowel sound ending in i
Ichidan: 365, Godan: 320
vowel sound ending in o
Nidan: 2, Godan: 456

for key in final_classification.keys():
    print 'vowel sound ending in %s' % key
    total = sum(map(lambda i: i[1], final_classification[key].items()))
    print ", ".join(map(lambda s: ": ".join((s[0], format(s[1] / float(total) * 100, ".2f"))), final_classification[key].items()))

vowel sound ending in a
Yodan: 0.93, Nidan: 0.37, Ichidan: 0.09, Godan: 98.60
vowel sound ending in u
Nidan: 0.25, Ichidan: 20.05, Godan: 79.70
vowel sound ending in e
Yodan: 0.03, Ichidan: 96.02, Godan: 3.95
vowel sound ending in i
Ichidan: 53.28, Godan: 46.72
vowel sound ending in o
Nidan: 0.44, Godan: 99.56

Analyzing verb endings in the Japanese language

Goal of this study

Sample file extraction

Where are the verbs?

First statistical breakdown

Verb types by last character

Concluding remarks

High certainty

Lower certainty

Side note

Comments