Exploring Finnish words using the interactive IPython HTML widgets
In this post, we're going to take a look at the Finnish language. Our starting point is a file found here, which contains the 10,000 most common words in Finnish.
import urllib2
response = urllib2.urlopen('http://www.csc.fi/english/research/sciences/linguistics/taajuussanasto-B9996/download')
words = response.read()
As the file is encoded in utf-8, we first need to decode it, i.e. convert it to unicode, before we can use it. A great help in figuring this out is the fantastic tutorial presentation at http://farmdev.com/talks/unicode/.
words = words.decode('utf-8')
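To see what the decoding step actually does, here is a minimal offline sketch on a hypothetical byte string (not read from the real file):

```python
# UTF-8 stores 'ä' as two bytes (0xc3 0xa4); decoding recovers one code point.
# (Hypothetical byte string, not taken from the actual download.)
raw = b'h\xc3\xa4n (pronomini)'
text = raw.decode('utf-8')
# the decoded string is one character shorter than the raw byte string
```
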
We can now split the text to its component lines in order to start exploring it.
words = words.splitlines()
words[:10]
[u' Sanahakemisto (laskevan taajuuden mukaan)', u'', u' N Abs Rel Uppslagsord', u' 1 2716396 4,614851 olla (verbi)', u' 2 1566108 2,660641 ja (konjunktio)', u' 3 593462 1,008225 ei (verbi)', u' 4 538609 0,915036 se (pronomini)', u' 5 443301 0,753118 ett\xe4 (konjunktio)', u' 6 417984 0,710108 joka (pronomini)', u' 7 344927 0,585992 vuosi (substantiivi)']
As we can see, each line combines the word with additional information:
- rank
- absolute count of the word in the corpus
- relative frequency
- the word itself
These fields can be parsed in the following way: we write a simple lambda function for each one, which we can then apply to the whole list.
words[10]
u' 8 302803 0,514428 h\xe4n (pronomini)'
print words[10]
8 302803 0,514428 hän (pronomini)
First, the rank.
rank = lambda w: int(w[:8])
rank(words[10])
8
The absolute word count.
abs_count = lambda w: int(w[8:15])
abs_count(words[10])
302803
The relative count.
rel_count = lambda w: float(w[15:25].replace(',', '.'))
rel_count(words[10])
0.514428
Finally, the word itself, in unicode form.
the_word = lambda w: w[25:].split('(')[0]
print the_word(words[10])
hän
Having set up these functions, we can apply them to each row of the text file that we want to parse.
word_dict = dict(
    [(the_word(w),
      (rank(w), abs_count(w), rel_count(w))) for w in words[3:-6]])
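To check the pipeline end to end without downloading anything, the same lambdas can be applied to a couple of sample lines. These lines are reconstructed from the output shown earlier, so the exact column widths (8 / 7 / 10 characters) are an assumption:

```python
# A simple lambda function per fixed-width field of each line
rank = lambda w: int(w[:8])
abs_count = lambda w: int(w[8:15])
rel_count = lambda w: float(w[15:25].replace(',', '.'))
the_word = lambda w: w[25:].split('(')[0]

# Hypothetical sample lines, reconstructed from the output shown above
sample = [u'       8 302803 0,514428 h\xe4n (pronomini)',
          u'       7 344927 0,585992 vuosi (substantiivi)']
word_dict_demo = dict((the_word(w), (rank(w), abs_count(w), rel_count(w)))
                      for w in sample)
# note that the parsed keys keep a trailing space, e.g. u'h\xe4n '
```
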
With this new word dictionary, we can build a sort of widget that lets us play with the words.
from IPython.html.widgets import interact
from IPython.display import HTML, display
def show_word(n):
    word = word_dict.keys()[n]
    s = '<h3>Word: %s</h3><table>\n' % word
    # the value tuple is (rank, absolute count, relative count)
    for k, v in zip(('rank', 'absolute count', 'relative count'),
                    word_dict[word]):
        s += '<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v)
    s += '</table>'
    display(HTML(s))
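Setting the display call aside, the HTML assembly is plain string formatting; a standalone sketch of the same table construction, with a made-up entry, looks like this:

```python
def word_table(word, stats):
    # stats is assumed to be a (rank, absolute count, relative count) tuple
    s = '<h3>Word: %s</h3><table>\n' % word
    for k, v in zip(('rank', 'absolute count', 'relative count'), stats):
        s += '<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v)
    return s + '</table>'

html = word_table(u'vuosi', (7, 344927, 0.585992))
```
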
Let's test this function by calling it on the item at index 3.
show_word(3)
Word: arvokisa
rank | 4188 |
absolute count | 1360 |
relative count | 0.00231 |
And now, let's make this interactive with the latest IPython widget machinery.
interact(show_word,
n=(0, len(word_dict.keys()) - 1))
Word: diplomaattinen
rank | 7201 |
absolute count | 672 |
relative count | 0.001142 |
<function __main__.show_word>
Adding a translation from an external website
We can easily supplement a translation of the word displayed by using an external website and displaying a search page for the word we're using:
from IPython.display import IFrame
IFrame('http://www.fincd.com/index.php?txtSearch=tunti&lang=fi', width='100%', height=350)
But how do we encode unicode strings for use in URLs?
Let's use an example of what we want to do: the encoded url for the word kyllä is http://www.fincd.com/finnish/kyll%E4.html
my_word = u'kyllä'
This word has to be encoded somehow. Looking at the website, we find it uses the iso-8859-1 encoding. Let's try that on our word.
my_word.encode('iso-8859-1')
'kyll\xe4'
Once encoded, we can use the quote method to make it ready for URLs:
urllib2.quote(my_word.encode('iso-8859-1'))
'kyll%E4'
Finally, we can put this together:
print 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % urllib2.quote(my_word.encode('iso-8859-1'))
http://www.fincd.com/index.php?txtSearch=kyll%E4&lang=fi
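urllib2.quote here is just the standard library's percent-encoder applied to the latin-1 bytes; for reference, the Python 3 equivalent lives in urllib.parse, and the import fallback below handles both versions:

```python
try:
    from urllib import quote        # Python 2
except ImportError:
    from urllib.parse import quote  # Python 3

encoded = u'kyll\xe4'.encode('iso-8859-1')   # 'ä' becomes the single byte 0xE4
url = 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % quote(encoded)
```
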
Good! Now, let's write a function that shows the word together with its translation, and interact with it.
def show_word_and_translation(n):
    word = word_dict.keys()[n]
    s = '<h3>Word: %s</h3><table>\n' % word
    for k, v in zip(('rank', 'absolute count', 'relative count'),
                    word_dict[word]):
        s += '<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v)
    s += '</table>'
    display(HTML(s))
    # word keeps a trailing space from parsing, hence the [:-1]
    url = 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % urllib2.quote(word[:-1].encode('iso-8859-1'))
    display(IFrame(url, width='100%', height=350))
interact(show_word_and_translation,
n=(0, len(word_dict.keys()) - 1))
Word: enso
rank | 2690 |
absolute count | 2351 |
relative count | 0.003994 |
Can we do better than that? Let's see if we can extract just the table with the relevant content from the website.
response = urllib2.urlopen('http://www.fincd.com/finnish/kyll%E4.html')
source = response.read()
source_split = source.decode("iso-8859-1").splitlines()
The really interesting part is here:
source_split[85:120]
[u'\t<a href="/friends/">[Links]</a> ', u'\t<a href="javascript:bookmark()">[Bookmark]</a>', u' </td></tr>', u'</table>', u'', u'<table width="728" align="center" id="tbDotBorder">', u' <tr>', u' <td id="lang_cell" width="20%">Finnish:</td>', u' <td id="helper_cell" width="80%"><a href = "/finnish/kyll%E4.html">kyll\xe4</a>\t', u'\t</td>', u' </tr>', u' <tr>', u' <td id="lang_cell" width="20%">English:</td>', u' <td id="content_cell" width="80%">', u'\t<li><a href="/english/yes.html">yes</a></li>', u'\t<p id="msg"></p>\t</td>', u' </tr>', u' <tr>', u' <td colspan="2" id="suggestion_cell"><!--Write your own explain here--></td>', u' </tr>', u' <tr>', u' <td colspan="2">', u' <table width="100%">', u' <td width="50%" id="discuss_cell"> </td>', u' <td width="50%" id="discuss_cell"> </td>', u' </tr>', u' </table>', u' </td>', u' <tr><td colspan="3" align="right" id="copy_right"><small><a href = "/old/" title="Suomi Englanti sanakirja ">Suomi Englanti Suomi sanakirja Beta5</a></small></td></tr>', u'</table>', u'', u'<br>', u'', u'<table width="728" align="center" id="tbDotBorder" style="border-style:none">', u'<tr>']
We can find the exact indices for the table with the information here:
source_split.index(u'<table width="728" align="center" id="tbDotBorder">')
90
source_split.index(u'<table width="728" align="center" id="tbDotBorder" style="border-style:none">')
118
This slice can easily be rendered as HTML.
HTML("".join(source_split[90:118]))
Let's try to extract only the meaningful information using regular expressions.
src = "".join(source_split[90:118])
import re
p = re.compile('<tr>')
iterator = p.finditer(src)
for match in iterator:
    print match.span()
(53, 57)
(201, 205)
(368, 372)
(461, 465)
(619, 623)
Judging from these matches, we only need the first two table rows to extract our data. The data therefore lies between characters 53 and 368, the start of the first and of the third <tr>.
Let's design a function that extracts exactly this last part:
def extract_word_definition(source):
    source_split = source.decode("iso-8859-1").splitlines()
    start = source_split.index(u'<table width="728" align="center" id="tbDotBorder">')
    stop = source_split.index(u'<table width="728" align="center" id="tbDotBorder" style="border-style:none">')
    src = "".join(source_split[start:stop])
    p = re.compile('<tr>')
    iterator = p.finditer(src)
    spans = [match.span() for match in iterator]
    start = spans[0][0]
    stop = spans[2][0]
    return src[start:stop]
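The span-slicing logic can be exercised without a network round trip on a small synthetic snippet (this HTML is made up and much shorter than the real page):

```python
import re

# Made-up miniature of the dictionary page: keep everything from the
# first <tr> up to, but not including, the third <tr>.
src = ('<table><tr><td>Finnish:</td><td>kyll\xe4</td></tr>'
       '<tr><td>English:</td><td>yes</td></tr>'
       '<tr><td>comments</td></tr></table>')
spans = [m.span() for m in re.finditer('<tr>', src)]
definition = src[spans[0][0]:spans[2][0]]
```
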
We can integrate this into the existing code:
def show_word_and_translation_html_only(n):
    word = word_dict.keys()[n]
    s = '<h3>Word: %s</h3><table>\n' % word
    for k, v in zip(('rank', 'absolute count', 'relative count'),
                    word_dict[word]):
        s += '<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v)
    url = 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % urllib2.quote(word[:-1].encode('iso-8859-1'))
    s += extract_word_definition(urllib2.urlopen(url).read())
    s += '</table>'
    display(HTML(s))
interact(show_word_and_translation_html_only,
n=(0, len(word_dict.keys()) - 1))
Finally, I can present the same tool, but with the words sorted by rank. Note that the dictionary itself is not indexed by rank:
word_dict[0]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-112-547a459bb01d> in <module>()
----> 1 word_dict[0]

KeyError: 0
sorted_keys = sorted(word_dict.keys(), key=lambda n:word_dict[n][0])
sorted_keys[:10]
[u'olla ', u'ja ', u'ei ', u'se ', u'ett\xe4 ', u'joka ', u'h\xe4n ', u'saada ', u'mutta ', u't\xe4m\xe4 ']
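Sorting by rank is the usual sorted(..., key=...) pattern: the key function looks up each word's value tuple and returns its first element. On a tiny hypothetical dictionary:

```python
# Hypothetical miniature of word_dict: word -> (rank, abs count, rel count)
demo = {u'vuosi ': (7, 344927, 0.585992),
        u'olla ': (1, 2716396, 4.614851),
        u'h\xe4n ': (8, 302803, 0.514428)}
# sort the keys by the rank stored as the first element of each value tuple
by_rank = sorted(demo, key=lambda w: demo[w][0])
```
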
def show_word_and_translation_html_only_sorted(n):
    word = sorted_keys[n]
    s = '<h3>Word: %s</h3><table>\n' % word
    for k, v in zip(('rank', 'absolute count', 'relative count'),
                    word_dict[word]):
        s += '<tr><td>{0}</td><td>{1}</td></tr>\n'.format(k, v)
    url = 'http://www.fincd.com/index.php?txtSearch=%s&lang=fi' % urllib2.quote(word[:-1].encode('iso-8859-1'))
    s += extract_word_definition(urllib2.urlopen(url).read())
    s += '</table>'
    display(HTML(s))
interact(show_word_and_translation_html_only_sorted,
n=(0, len(word_dict.keys()) - 1))
To make things easier, we can also just consider the first 200 words:
interact(show_word_and_translation_html_only_sorted,
n=(0, 200))
Conclusions
In this post, I've developed a simple dictionary tool for learning the Finnish language, one that can be used interactively with the IPython Notebook HTML widgets. I found this fun to write, and I hope to use it again for learning purposes.
One thing that could be added is a notion of the words I already know, a sort of vocabulary database; the tool could then suggest new learning candidates based on spelling similarity, so as to expand my current vocabulary.