Sometimes it may be useful to detect the language of a text. *NIX systems usually
have a directory containing the most commonly used words of various languages;
simply counting how many of the words in the text appear in each of those word
lists seems to give a meaningful indication of which language the text
is written in.
For example, checking this approximation against the translated text (in Markdown,
before being converted to HTML) of the "entrevista de Snowden para Der Spiegel"
shows the following results:
| $ python lang.py entrevista-de-edward-snowden-para-der-spiegel.txt
entrevista-de-edward-snowden-para-der-spiegel.txt [2341]
58.61% spanish
51.60% galician
29.18% american-english
29.09% british-english
|
Doing the same with the GPLv3 text (in English) results in:
| $ python lang.py LICENSE
LICENSE [5251]
98.36% american-english
98.15% british-english
12.82% spanish
10.68% galician
|
While the approximation is very simple, the results seem acceptable.
The script used to perform this test is shown below; it is written to use the
"american-english", "british-english", "galician" and "spanish" dictionaries,
but any others may be used, obviously:
#!/usr/bin/env python
import re
import os
import sys
# Switch for ANSI coloring of the file name in the printed results
color = True

# Escape sequences wrapped around the file name; both are empty when
# coloring is disabled.
colorfile = "\x1b[0;92m" if color else ""  # Green
colorend = "\x1b[0m" if color else ""  # End of colored output

# Word lists to compare against, looked up by name inside `path`
path = "/usr/share/dict"
dictionaries = ["american-english", "british-english", "galician", "spanish"]
class LangReader:
    '''Scores a text against several word lists, one per language.

    Attributes:
        words: compiled pattern used to split a text into words.
        dicts: maps each dictionary name (the path it was loaded from) to
            the set of lowercased words read from that file.
    '''

    words = re.compile(r'\w+')

    def __init__(self, dicts):
        '''Loads the dictionaries with the words, one per line, lowercased.

        `dicts` is an iterable of file paths; each path doubles as the key
        under which its word set is stored in `self.dicts`.
        '''
        self.dicts = {}
        for d in dicts:
            # The context manager closes every word list once it is read,
            # instead of leaving the handle to the garbage collector.
            with open(d, "rt") as wordlist:
                self.dicts[d] = set(line.strip().lower() for line in wordlist)

    def get_file_props(self, f):
        '''Returns `get_props` applied to the whole content of the file `f`.'''
        return self.get_props(f.read())

    def get_percents(self, data):
        '''Calculates the percentage of words for every language.

        Returns a dict mapping each language to the share of words of
        `data` found in its dictionary, rounded to two decimals. A text
        without any word yields 0.0 for every language.
        '''
        props, total = self.get_props(data)
        if total == 0:
            # Nothing to divide by: report zero instead of raising.
            return dict((lang, 0.0) for lang in props)
        return dict(
            (lang, round((float(count) / total) * 100, 2))
            for lang, count in props.items()
        )

    def get_props(self, data):
        '''Returns the number of matching words for every language.

        The result is a `(counters, total)` tuple: `counters` maps each
        language to how many words of `data` appear in its dictionary and
        `total` is the number of words found in `data`.
        '''
        # A real list is required: the words are walked once per language
        # and their amount is returned at the end.
        found = [word.lower() for word in self.words.findall(data)]
        counters = {}
        for lang, ldict in self.dicts.items():
            counters[lang] = sum(1 for word in found if word in ldict)
        return counters, len(found)
if __name__ == "__main__":
    # Command-line entry point: for every file given as argument, print how
    # many words it has and the percentage of them found in each dictionary,
    # best match first.
    if len(sys.argv) == 1:
        sys.stderr.write("%s <file>\n" % sys.argv[0])
        # A missing argument is a usage error, so report failure to the shell.
        sys.exit(1)

    # Resolve the input paths before changing directory, so relative
    # arguments keep pointing at the files the user meant.
    targets = [(name, os.path.abspath(name)) for name in sys.argv[1:]]

    # The dictionaries are loaded by bare name from inside `path`, which
    # keeps the language names printed below free of directory prefixes.
    os.chdir(path)
    ld = LangReader(dictionaries)

    for index, (name, location) in enumerate(targets):
        if index:
            # Separate the reports of consecutive files.
            print("\n")

        # Open one file at a time and close it as soon as it is read.
        with open(location, "rt") as f:
            props, total = ld.get_file_props(f)

        if total > 0:
            print("%s%s%s [%i]\n" % (colorfile, name, colorend, total))
            # Languages ordered by number of matching words, highest first.
            for lang in sorted(props, key=props.get, reverse=True):
                print("%5.2f%% %s" % ((props[lang] / float(total)) * 100, lang))
        else:
            print("%s%s%s nothing found" % (colorfile, name, colorend))
|