Saltar ó contido principal

Detecting a text's language

Sometimes it may be useful to detect the language a text is written in. *NIX systems usually have a directory containing the most commonly used words of various languages; simply counting how many words of the text appear in each of those word lists seems to give a meaningful indication of which language the text is written in.

For example, checking this approach against the translated text (in Markdown, before being converted to HTML) of the "entrevista de Snowden para Der Spiegel" shows the following results:

1
2
3
4
5
6
7
$ python lang.py entrevista-de-edward-snowden-para-der-spiegel.txt
entrevista-de-edward-snowden-para-der-spiegel.txt [2341]

58.61%  spanish
51.60%  galician
29.18%  american-english
29.09%  british-english

Doing the same with the GPLv3 text (in English) results in:

1
2
3
4
5
6
7
$ python lang.py LICENSE
LICENSE [5251]

98.36%  american-english
98.15%  british-english
12.82%  spanish
10.68%  galician

While the approach is very simple, the results seem acceptable.

The script used to perform this test is shown below. It is written to use the "american-english", "british-english", "galician" and "spanish" dictionaries, but any others may be used, obviously:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python

import re
import os
import sys

# Output colouring: when enabled, the file name in each report header is
# wrapped in ANSI escape sequences.
color = True
colorfile = "\x1b[0;92m" if color else ""  # Green
colorend = "\x1b[0m" if color else ""      # End of colored output

# Directory holding the word lists, and the word lists to check against
path = "/usr/share/dict"
dictionaries = ["american-english", "british-english", "galician", "spanish"]

class LangReader:
    '''Scores a text against several word lists to guess its language.

    Each dictionary is a text file with one word per line.  A text is
    split into words and, for every dictionary, the number of words that
    appear in it is counted.
    '''

    # Pattern used to split a text into words.
    # NOTE(review): on Python 2 byte strings `\w` presumably only matches
    # ASCII word characters, which would split accented words -- confirm.
    words = re.compile(r'\w+')

    def __init__(self, dicts):
        '''Loads the dictionaries with the words, one per line, lowercased.

        dicts -- iterable of dictionary file names; each name is opened
                 as given and is also used as the language key in the
                 results.
        '''
        self.dicts = {}
        for d in dicts:
            # The context manager closes the word list once it is loaded.
            with open(d, "rt") as source:
                self.dicts[d] = set(line.strip().lower() for line in source)

    def get_file_props(self, f):
        '''Same as get_props(), reading the text from the file object f.'''
        return self.get_props(f.read())

    def get_percents(self, data):
        '''Calculates the percentage of words for every language.

        Returns a dict mapping every language to the percentage (rounded
        to two decimals) of words in data found in its dictionary.  A
        text without any word scores 0.0 for every language.
        '''
        props, total = self.get_props(data)
        if total == 0:
            # Nothing to divide by: avoid a ZeroDivisionError
            return dict((lang, 0.0) for lang in props)

        percents = {}
        for lang in props:
            percents[lang] = round((float(props[lang]) / total) * 100, 2)
        return percents

    def get_props(self, data):
        '''Returns the number of matching words for every language.

        The result is a tuple (counters, total): counters maps every
        language to the number of words of data present in its
        dictionary, and total is the number of words found in data.
        '''
        # A real list (not a lazy map object) is needed: it is walked once
        # per language and its length is returned.
        tokens = [w.lower() for w in self.words.findall(data)]

        counters = {}
        for lang, ldict in self.dicts.items():
            counters[lang] = sum(1 for word in tokens if word in ldict)

        return counters, len(tokens)


# Command-line entry point: prints, for every file given as an argument,
# the share of its words found in each dictionary, best match first.
if __name__ == "__main__":
    if len(sys.argv) == 1:
        sys.stderr.write("%s <file>\n" % sys.argv[0])
        # Exit status 0 is kept from the original behaviour.
        sys.exit(0)

    # Resolve the input paths before changing directory, so that relative
    # paths keep working once the process is inside the dictionary folder.
    targets = [(name, os.path.abspath(name)) for name in sys.argv[1:]]

    # The dictionaries are opened by bare name, relative to `path`
    os.chdir(path)
    ld = LangReader(dictionaries)

    for index, (name, location) in enumerate(targets):
        # Blank separator between consecutive reports
        if index:
            print("\n")

        # Open one file at a time and close it as soon as it has been read
        with open(location, "rt") as f:
            props, total = ld.get_file_props(f)

        if total > 0:
            print("%s%s%s [%i]\n" % (colorfile, name, colorend, total))
            # Languages with the most matching words first
            for lang in sorted(props, key=props.get, reverse=True):
                print("%5.2f%%  %s" % ((props[lang] / float(total)) * 100,
                                       lang))
        else:
            print("%s%s%s nothing found" % (colorfile, name, colorend))