-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathextract_ling.py
50 lines (36 loc) · 1 KB
/
extract_ling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# -*- coding: utf-8 -*-
"""
@author: Mads Olsgaard, 2014
Released under BSD-3 License
This script loads Japanese text files and computes various statistics on linguistic data.
"""
#############
## Imports ##
#############
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import matplotlib
import re, codecs
# Set the default font family for all matplotlib text to TakaoPGothic.
# NOTE(review): presumably chosen so Japanese characters used as tick labels
# render correctly; the font must be installed on the machine — confirm.
matplotlib.rc('font', family='TakaoPGothic')
################
## Load files ##
################
# Read both corpora as UTF-8 text. The context managers close each file
# handle as soon as its contents are in memory, instead of leaving the
# handles open until the interpreter happens to collect them.
with codecs.open("wikipedia_jp.txt", 'r', 'utf-8') as wiki_file:
    file1 = wiki_file.read()
with codecs.open("gutenberg_jp.txt", 'r', 'utf-8') as gutenberg_file:
    file2 = gutenberg_file.read()

# The two texts are concatenated into one string for analysis.
data = file1 + file2
print("length of data file: ", len(data))
def plot_data(data, max=100):
    """Plot the counts of the most frequent items in ``data``.

    Every item of ``data`` is counted (each character, when ``data`` is a
    string), the number of distinct items is printed, and a new matplotlib
    figure is shown with the counts of the ``max`` most frequent items in
    descending order. The items themselves are used as x-axis tick labels.

    Parameters
    ----------
    data : iterable of hashable items
        The sequence whose items are counted.
    max : int, optional
        How many of the most frequent items to plot (default 100). The name
        shadows the builtin ``max`` inside this function; it is kept because
        callers may pass it by keyword.

    Returns
    -------
    None
    """
    frequencies = Counter(data)
    print("length of c", len(frequencies))

    # (item, count) pairs for the `max` most frequent items.
    top_items = frequencies.most_common(max)
    labels = [item for item, _ in top_items]
    counts = [count for _, count in top_items]

    # Plot the counts from largest to smallest.
    counts.sort(reverse=True)
    tick_positions = range(len(top_items))

    pl.figure()
    pl.plot(counts)
    pl.xticks(tick_positions, labels)
    pl.show()
# Script entry: count every character of the combined text and plot the
# frequencies of the most common ones (plot_data's default limit applies).
print("distribution of all characters")
plot_data(data)