forgot to set up a git repo, so here it is
This commit is contained in:
commit
1245552e24
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.xml
|
1022663
dico.dictfr
Normal file
1022663
dico.dictfr
Normal file
File diff suppressed because it is too large
Load Diff
14247
les_miserables_tomeI.txt
Normal file
14247
les_miserables_tomeI.txt
Normal file
File diff suppressed because it is too large
Load Diff
475
parser.py
Executable file
475
parser.py
Executable file
@ -0,0 +1,475 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
###############################################################################
|
||||
# imports #
|
||||
###############################################################################
|
||||
import csv
import re
import sys

try:
    # cElementTree was deprecated in Python 3.3 and removed in 3.9;
    # plain ElementTree picks up the C accelerator automatically.
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
|
||||
|
||||
###############################################################################
|
||||
# Regex #
|
||||
###############################################################################
|
||||
|
||||
# Raw strings are used for every pattern below: sequences such as '\!'
# are invalid *string* escapes (a SyntaxWarning on modern Python), and
# punctuation inside a character class needs no backslash anyway.

# Sometimes words are split; this regex removes the whitespace left
# between consecutive ending-punctuation marks so they re-merge.
blanks_between_points = re.compile(r'(?<=[!?.]) (?=[!?.])')

# Finds groups of ending punctuation (!, ? or .).
# It is used to split the text into sentences.
ending_punctuation = re.compile(r'[!?.]+')

# Checks that the argument given to this program ends with at least one
# alphanumeric character followed by ".txt".  It is used twice:
#   - argument conformity verification
#   - resulting file creation ("txt" characters are replaced by "xml")
filename_end = re.compile(r'(?<=\w\.)txt$')

# Used to replace -, ", ', ;, : and , characters (possibly grouped)
# by a single space.
intermediate_punctuation = re.compile(r'[,;:"\'-]+')

# Replaces groups of white characters (\n, \r, \t, space) by a single
# space.
whites = re.compile(r'[\n\r\t ]+')

# Sometimes words are hyphenated across a line break; this regex
# re-merges them.
word_breaks = re.compile(r'-[\n\r]+')
|
||||
|
||||
###############################################################################
|
||||
# global variables #
|
||||
###############################################################################
|
||||
|
||||
# Alphabet of lower-case French characters, including accented letters
# and ligatures.  NOTE(review): not referenced anywhere in this file —
# presumably kept for future filtering; confirm before removing.
french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"

# input text to parse (filled from the file given on the command line)
txt = ""

# dictionary of words loaded from linguistic dictionary:
# maps a lower-case word (str) to its `Word` instance
dictionary = {}

# GRACE category letter (position 0 of a GRACE code) -> word class.
# NOTE(review): these values are emitted verbatim into the XML output,
# so the French spellings "conjonction"/"ponctuation" are kept as-is;
# fixing them would change the output format for downstream consumers.
wordtype = {
    'N': "noun",
    'V': "verb",
    'P': "pronoun",
    'A': "adjective",
    'D': "determiner",
    'R': "adverb",
    'S': "adposition",
    'C': "conjonction",
    'I': "interjection",
    'X': "residual",
    'F': "ponctuation",
    '?': "unknown"
}

# Noun subtype (GRACE position 1 of an 'N' code).
nountypes = {'c': "common", 'p': "proper", 'k': "cardinal", '-': "None"}

# Verb subtype (position 1 of a 'V' code).
verbtypes = {'m': "main", 'a': "auxiliary", '-': "None"}

# Verb mood (position 2 of a 'V' code).
verbmood = {
    'i': "indicative",
    's': "subjunctive",
    'm': "imperative",
    'c': "conditional",
    'n': "infinitive",
    'p': "participle",
    '-': "None"
}

# Verb tense (position 3 of a 'V' code).
verbtense = {
    'p': "present",
    'i': "imperfect",
    'f': "future",
    's': "past",
    '-': "None"
}

# Adjective subtype (position 1 of an 'A' code).
# NOTE(review): "qualifiative" looks like a typo for "qualificative",
# but it is part of the emitted XML vocabulary — confirm before fixing.
adjtype = {
    'f': "qualifiative",
    'o': "ordinal",
    'k': "cardinal",
    'i': "indefinite",
    's': "possessive",
    '-': "None"
}

# Degree (adjectives position 2, adverbs position 2).
degree = {'p': "positive", 'c': "comparative", 'n': "negative", '-': "None"}

# Pronoun subtype (position 1 of a 'P' code).
# NOTE(review): "demanstrative" looks like a typo for "demonstrative";
# kept as-is because it is emitted verbatim — confirm before fixing.
pronountype = {
    'p': "personal",
    'd': "demanstrative",
    'i': "indefinite",
    's': "possessive",
    't': "interrogative",
    'r': "relative",
    'x': "reflexive",
    'k': "cardinal",
    '-': "None"
}

# Pronoun case (position 5 of a 'P' code).
pronouncase = {
    'n': "nominative",
    'a': "accusative",
    'd': "dative",
    'o': "oblique",
    'g': "genitive",
    '-': "None"
}

# Determiner subtype (position 1 of a 'D' code).
# NOTE(review): "iter-excl" presumably means "inter-excl"
# (interrogative/exclamative) — confirm before fixing.
detertypes = {
    'a': "article",
    'd': "demonstrative",
    's': "possessive",
    'i': "indefinite",
    't': "iter-excl",
    'r': "relative",
    'k': "cardinal",
    '-': "None"
}

# Determiner nature (position 6 of a 'D' code).
deternature = {'d': "definite", 'i': "indefinite", '-': "None"}

# Adverb subtype (position 1 of an 'R' code).
advtypes = {'g': "general", 'p': "particle", 'x': "interro-excl", '-': "None"}

# Adposition subtype (position 1 of an 'S' code).
adptypes = {'p': "preposition", 'd': "deictique", '-': "None"}

# Conjunction subtype (position 1 of a 'C' code).
# NOTE(review): "subortinative" looks like a typo for "subordinative";
# emitted verbatim — confirm before fixing.
conjtypes = {'c': "coordinative", 's': "subortinative", '-': "None"}

# Grammatical gender (shared by several categories).
gender = {'m': "masculine", 'f': "feminine", '-': "None"}

# Grammatical number (shared by several categories).
number = {'s': "singular", 'p': "plural", '-': "None"}

# Grammatical person (shared by several categories).
person = {'1': "first", '2': "second", '3': "third", '-': "None"}
|
||||
|
||||
###############################################################################
|
||||
# classes definition #
|
||||
###############################################################################
|
||||
|
||||
|
||||
class Word:
    """Any word found in the language's dictionary.

    It holds the plain string representation of the word, the list of
    its GRACE descriptions, the list of its GRACE categories (first
    letter of each description), its phonetics and its frequency in
    the working text.
    """

    def __init__(self, word: str, grammar, phonetics: str):
        """Build a `Word`.

        :param word: plain string representation of the word
        :param grammar: list of GRACE description strings
        :param phonetics: phonetics of the word, as a string
        """
        self.__word = word
        self.__grace = grammar
        # One category letter per GRACE description (its first char).
        self.__category = [description[0] for description in grammar]
        self.__phon = phonetics
        # Number of occurrences seen in the working text so far.
        self.__frequency = 0

    def __str__(self):
        """Return a debug-friendly stringified `Word`."""
        res = self.__word + "\t/" + self.__phon + "/"
        for elem in self.__grace:
            res += "\n\t" + graceToDebugString(elem)
        return res

    def get_word(self):
        """Return a copy of the plain string representation."""
        return self.__word[:]

    def get_grace(self):
        """Return a copy of the list of GRACE descriptions."""
        return self.__grace[:]

    def get_phon(self):
        """Return a copy of the phonetics string."""
        return self.__phon[:]

    def increase_frequency(self):
        """Increase by one the frequency of this `Word`."""
        self.__frequency += 1

    def get_frequency(self):
        """Return the frequency of this `Word` as a number."""
        return self.__frequency
|
||||
|
||||
|
||||
###############################################################################
|
||||
# functions definition #
|
||||
###############################################################################
|
||||
|
||||
|
||||
def graceToDebugString(grace: str):
    """Turn a GRACE description into a single human-readable string.

    For debug purposes only.  Categories 'I', 'X', 'F' and '?' carry
    no extra attributes, so only the word class is emitted for them.

    :param grace: one GRACE description code, e.g. "Ncms"
    :raises IndexError: when `grace` is shorter than its category implies
    :raises KeyError: when a position holds an unknown letter
    """
    res = wordtype[grace[0]]
    if grace[0] == 'N':
        res += " " + nountypes[grace[1]]
        res += " " + gender[grace[2]]
        res += " " + number[grace[3]]
    elif grace[0] == 'V':
        res += " " + verbtypes[grace[1]]
        res += " " + verbmood[grace[2]]
        res += " " + verbtense[grace[3]]
        res += " " + person[grace[4]]
        res += " " + number[grace[5]]
        res += " " + gender[grace[6]]
    elif grace[0] == 'P':
        res += " " + pronountype[grace[1]]
        res += " " + person[grace[2]]
        res += " " + gender[grace[3]]
        res += " " + number[grace[4]]
        res += " " + pronouncase[grace[5]]
        # NOTE(review): position 6 of a pronoun is rendered with the
        # `number` table here and called "possessor" in prepare_output
        # — confirm which naming is intended.
        res += " " + number[grace[6]]
    elif grace[0] == 'A':
        res += " " + adjtype[grace[1]]
        res += " " + degree[grace[2]]
        res += " " + gender[grace[3]]
        res += " " + number[grace[4]]
    elif grace[0] == 'D':
        res += " " + detertypes[grace[1]]
        res += " " + person[grace[2]]
        res += " " + gender[grace[3]]
        res += " " + number[grace[4]]
        res += " " + number[grace[5]]
        res += " " + deternature[grace[6]]
    elif grace[0] == 'R':
        res += " " + advtypes[grace[1]]
        res += " " + degree[grace[2]]
    elif grace[0] == 'S':
        res += " " + adptypes[grace[1]]
    elif grace[0] == 'C':
        res += " " + conjtypes[grace[1]]
    # 'I', 'X', 'F' and '?' intentionally add nothing.
    return res
|
||||
|
||||
|
||||
def graceToString(grace: str):
    """Return an array of strings of explicit GRACE description."""
    # Lookup tables for attribute positions 1..n of every GRACE
    # category, in output order.  Categories absent from the table
    # ('I', 'X', 'F', '?') only report their word class.
    attribute_tables = {
        'N': (nountypes, gender, number),
        'V': (verbtypes, verbmood, verbtense, person, number, gender),
        'P': (pronountype, person, gender, number, pronouncase, number),
        'A': (adjtype, degree, gender, number),
        'D': (detertypes, person, gender, number, number, deternature),
        'R': (advtypes, degree),
        'S': (adptypes,),
        'C': (conjtypes,),
    }
    category = grace[0]
    description = [wordtype[category]]
    for position, table in enumerate(attribute_tables.get(category, ()), 1):
        description.append(table[grace[position]])
    return description
|
||||
|
||||
|
||||
def init_dictionary(path='dico.dictfr'):
    """Load the language's dictionary into the global `dictionary`.

    The file is a comma-separated CSV with, in its first column, the
    word itself; in the second, a semicolon-separated GRACE definition
    of the word; and in the third, the word's X-SAMPA phonetics.

    :param path: dictionary file to load; defaults to the historical
        hard-coded 'dico.dictfr' so existing callers keep working
    """
    # NOTE(review): the bundled dictionary contains accented French
    # characters and is assumed to be UTF-8 encoded — confirm.
    with open(path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            # str.split always returns a list, and yields the whole
            # string as its single element when ';' is absent, so no
            # special case is needed.
            grammar = row[1].split(';')
            dictionary[row[0]] = Word(row[0], grammar, row[2])
|
||||
|
||||
|
||||
def clean_txt(txt):
    """This function cleans string given as argument"""
    # Each (regex, replacement) pair is applied in order:
    #  1. re-merge words hyphenated across a line break
    #  2. replace :;,'"- characters by a space
    #  3. collapse groups of white characters (\n, \r, \t, space)
    #     into a single space
    #  4. remove whites left between ending points (!, ? or .)
    cleanup_steps = (
        (word_breaks, ''),
        (intermediate_punctuation, ' '),
        (whites, ' '),
        (blanks_between_points, ''),
    )
    for regex, replacement in cleanup_steps:
        txt = regex.sub(replacement, txt)
    return txt
|
||||
|
||||
|
||||
def filter_text(txt):
    """Build a list of filtered sentences from the raw text.

    Each filtered sentence is itself a list of strings; every string
    is a lower-cased word.  In the same pass, words missing from the
    global `dictionary` are added to it with an unknown ('?') GRACE
    description, and the frequency of every selected word is
    increased as it is met.

    :param txt: raw text to clean, split and filter
    :return: list of sentences, each a non-empty list of words
    """
    filtered_text = []
    # ending_punctuation.split can yield empty strings at the text
    # boundaries, hence the filter(None, ...).
    for sentence in filter(None, ending_punctuation.split(clean_txt(txt))):
        word_list = []
        # str.split() with no argument never yields empty strings, so
        # no extra filtering is needed here.
        for word in sentence.split():
            lower_word = word.lower()
            if lower_word not in dictionary:
                dictionary[lower_word] = Word(lower_word, ["?"], "")
            word_list.append(lower_word)
            dictionary[lower_word].increase_frequency()
        if word_list:
            filtered_text.append(word_list)
    return filtered_text
|
||||
|
||||
|
||||
def prepare_output(splitted_txt):
    """Build the output XML tree from the filtered text.

    Every sentence becomes a <sentence> element whose <word> children
    carry the word's frequency and its explicit GRACE attributes.

    :param splitted_txt: list of sentences, each a list of words that
        are keys of the global `dictionary`
    :return: the root <text> element of the XML tree
    """
    # XML attribute names for GRACE positions 1..n of each category,
    # in the order graceToString returns their values.  Categories
    # absent from the table ('I', 'X', 'F', '?') only get a wordclass.
    xml_attributes = {
        'N': ("wordtype", "gender", "number"),
        'V': ("wordtype", "mood", "tense", "person", "number",
              "gender"),
        'P': ("wordtype", "person", "gender", "number", "case",
              "possessor"),
        'A': ("wordtype", "degree", "gender", "number"),
        'D': ("wordtype", "person", "gender", "number", "possessor",
              "nature"),
        'R': ("wordtype", "degree"),
        'S': ("wordtype",),
        'C': ("wordtype",),
    }
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            # First GRACE description used.
            grace = word.get_grace()[0]
            try:
                # Bug fix: graceToString indexes `grace` and used to be
                # called *outside* this try, so a GRACE code shorter
                # than its category implies crashed the whole run
                # instead of being reported below.
                grace_details = graceToString(grace)
                attributes = {
                    "frequency": str(word.get_frequency()),
                    "wordclass": grace_details[0],
                }
                for position, name in enumerate(
                        xml_attributes.get(grace[0], ()), 1):
                    attributes[name] = grace_details[position]
                ET.SubElement(sentence, "word",
                              **attributes).text = word.get_word()
            except IndexError:
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
    return text
|
||||
|
||||
|
||||
###############################################################################
|
||||
# main program #
|
||||
###############################################################################
|
||||
|
||||
if __name__ == "__main__":
    # arg parse ###############################################################
    # The single argument must end with "<alphanumeric>.txt".
    # sys.exit is used instead of the interactive-only exit() builtin,
    # which is provided by the site module and not guaranteed in scripts.
    if len(sys.argv) < 2 or filename_end.findall(sys.argv[1]) == []:
        sys.exit("Usage: ./parser.py *.txt")

    # load files ##############################################################
    print("Loading input file...", end='')
    # NOTE(review): no encoding is given, so the locale default applies
    # — confirm the French input texts match it.
    with open(sys.argv[1], 'r') as txt_file:
        txt = txt_file.read()
    print("\rInput file loaded ")
    print("Loading dictionary...", end='')
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")

    # now comes the parsing work ##############################################
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    # The "txt" suffix of the input name is replaced by "xml".
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)
|
Loading…
Reference in New Issue
Block a user