forgot to set up a git repo, so here it is

2019-03-28 19:42:09 +01:00
commit 1245552e24
4 changed files with 1037386 additions and 0 deletions
@@ -0,0 +1 @@
 *.xml
@@ -0,0 +1,475 @@
 #!/usr/bin/env python3
 ###############################################################################
 #                                   imports                                   #
 ###############################################################################
 import csv
 import re
 import sys
 # xml.etree.cElementTree was removed in Python 3.9; fall back to the regular
 # module when the legacy alias is not available.
 try:
    import xml.etree.cElementTree as ET
 except ImportError:
    import xml.etree.ElementTree as ET
 ###############################################################################
 #                                    Regex                                    #
 ###############################################################################
 # All patterns are raw strings: escapes such as "\!" or "\w" inside a normal
 # string literal are invalid escape sequences and trigger warnings on recent
 # Python versions.
 #
 # Matches a single space located between two ending punctuation marks
 # (!, ? or .), so that groups such as "! ?" can be glued back together.
 blanks_between_points = re.compile(r'(?<=[!?.]) (?=[!?.])')
 # Matches groups of ending punctuation (!, ? or .).
 # It is used to split the text into sentences.
 ending_punctuation = re.compile(r'[!?.]+')
 # Matches the trailing "txt" of a name ending with at least one word
 # character followed by ".txt". It is used twice:
 # - to check that the argument given to the program is acceptable
 # - to build the name of the resulting file
 #   (the "txt" characters of the argument are replaced by "xml")
 filename_end = re.compile(r'(?<=\w\.)txt$')
 # Matches groups of -, ", ', ;, : and , characters; each group is replaced
 # by a single space.
 intermediate_punctuation = re.compile(r'[,;:"\'-]+')
 # Matches groups of white characters (newline, carriage return, tab, space);
 # each group is replaced by a single space.
 whites = re.compile(r'[\n\r\t ]+')
 # Matches a hyphen followed by line breaks, i.e. a word split across two
 # lines; removing the match merges the two halves again.
 word_breaks = re.compile(r'-[\n\r]+')
 ###############################################################################
 #                               global variables                              #
 ###############################################################################
 # Lower-case French letters, accented forms included (plus the capital "Ç").
 french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"
 # Input text to parse; the main program stores the content of the file here.
 txt = ""
 # Words loaded from the linguistic dictionary, keyed by their plain string.
 # Unknown words met while filtering the text are added to it as well.
 dictionary = {}

 # The tables below translate the one-character codes of a GRACE description
 # into readable labels. "-" always maps to the string "None".
 # NOTE(review): a few labels are misspelled ("conjonction", "ponctuation",
 # "qualifiative", "demanstrative", "iter-excl", "subortinative"). They end up
 # verbatim in the generated XML attributes, so they are kept unchanged here.

 # Word category: first character of a GRACE description.
 wordtype = {
    "N": "noun",
    "V": "verb",
    "P": "pronoun",
    "A": "adjective",
    "D": "determiner",
    "R": "adverb",
    "S": "adposition",
    "C": "conjonction",
    "I": "interjection",
    "X": "residual",
    "F": "ponctuation",
    "?": "unknown",
 }
 # Noun sub-types.
 nountypes = {
    "c": "common",
    "p": "proper",
    "k": "cardinal",
    "-": "None",
 }
 # Verb sub-types, moods and tenses.
 verbtypes = {
    "m": "main",
    "a": "auxiliary",
    "-": "None",
 }
 verbmood = {
    "i": "indicative",
    "s": "subjunctive",
    "m": "imperative",
    "c": "conditional",
    "n": "infinitive",
    "p": "participle",
    "-": "None",
 }
 verbtense = {
    "p": "present",
    "i": "imperfect",
    "f": "future",
    "s": "past",
    "-": "None",
 }
 # Adjective sub-types.
 adjtype = {
    "f": "qualifiative",
    "o": "ordinal",
    "k": "cardinal",
    "i": "indefinite",
    "s": "possessive",
    "-": "None",
 }
 # Degree, used for adjectives and adverbs.
 degree = {
    "p": "positive",
    "c": "comparative",
    "n": "negative",
    "-": "None",
 }
 # Pronoun sub-types and cases.
 pronountype = {
    "p": "personal",
    "d": "demanstrative",
    "i": "indefinite",
    "s": "possessive",
    "t": "interrogative",
    "r": "relative",
    "x": "reflexive",
    "k": "cardinal",
    "-": "None",
 }
 pronouncase = {
    "n": "nominative",
    "a": "accusative",
    "d": "dative",
    "o": "oblique",
    "g": "genitive",
    "-": "None",
 }
 # Determiner sub-types and natures.
 detertypes = {
    "a": "article",
    "d": "demonstrative",
    "s": "possessive",
    "i": "indefinite",
    "t": "iter-excl",
    "r": "relative",
    "k": "cardinal",
    "-": "None",
 }
 deternature = {
    "d": "definite",
    "i": "indefinite",
    "-": "None",
 }
 # Adverb, adposition and conjunction sub-types.
 advtypes = {
    "g": "general",
    "p": "particle",
    "x": "interro-excl",
    "-": "None",
 }
 adptypes = {
    "p": "preposition",
    "d": "deictique",
    "-": "None",
 }
 conjtypes = {
    "c": "coordinative",
    "s": "subortinative",
    "-": "None",
 }
 # Features shared by several categories.
 gender = {
    "m": "masculine",
    "f": "feminine",
    "-": "None",
 }
 number = {
    "s": "singular",
    "p": "plural",
    "-": "None",
 }
 person = {
    "1": "first",
    "2": "second",
    "3": "third",
    "-": "None",
 }
 ###############################################################################
 #                              classes definition                             #
 ###############################################################################
 class Word:
    """Entry of the language's dictionary.

    An entry keeps the plain string of the word, the list of its GRACE
    descriptions, the category letter of each description, its phonetics and
    the number of times the word was counted in the working text.
    """
    def __init__(self, word: str, grammar, phonetics: str):
        """Build a `Word`.

        `word` is the plain string of the word, `grammar` the list of its
        GRACE descriptions (strings) and `phonetics` its phonetics as a
        string. The frequency starts at zero.
        """
        self.__word = word
        self.__phon = phonetics
        self.__grace = grammar
        # The first character of a GRACE description is its category letter
        self.__category = [description[0] for description in grammar]
        self.__frequency = 0
    def __str__(self):
        """Return the word, its phonetics between slashes, then one indented
        line per GRACE description in readable form."""
        header = self.__word + "\t/" + self.__phon + "/"
        details = [
            "\n\t" + graceToDebugString(description)
            for description in self.__grace
        ]
        return header + "".join(details)
    def get_word(self):
        """Return the plain string of the `Word`"""
        return self.__word[:]
    def get_grace(self):
        """Return a copy of the list of GRACE descriptions of the `Word`"""
        return self.__grace[:]
    def get_phon(self):
        """Return the phonetics of the `Word` as a string"""
        return self.__phon[:]
    def increase_frequency(self):
        """Increase the frequency of the `Word` by one"""
        self.__frequency += 1
    def get_frequency(self):
        """Return the frequency of the `Word` as a number"""
        return self.__frequency
 ###############################################################################
 #                             functions definition                            #
 ###############################################################################
 def _grace_layout(category: str):
    """Return the lookup tables decoding the sub-fields of a GRACE category.

    The tuple is ordered by character position: its first table decodes
    ``grace[1]``, the second one ``grace[2]`` and so on. Categories without
    any sub-field (interjection, residual, punctuation, unknown) get ``()``.
    """
    # Built at call time so that the tables are only looked up when needed
    layouts = {
        'N': (nountypes, gender, number),
        'V': (verbtypes, verbmood, verbtense, person, number, gender),
        'P': (pronountype, person, gender, number, pronouncase, number),
        'A': (adjtype, degree, gender, number),
        'D': (detertypes, person, gender, number, number, deternature),
        'R': (advtypes, degree),
        'S': (adptypes,),
        'C': (conjtypes,),
    }
    return layouts.get(category, ())
 def graceToDebugString(grace: str) -> str:
    """Turns into a single string a GRACE description
    For debug purposes only

    The result is the labels returned by `graceToString` joined by single
    spaces, hence both functions always agree on the decoding.
    Raises IndexError when `grace` is too short for its category and
    KeyError when one of its codes is unknown.
    """
    return " ".join(graceToString(grace))
 def graceToString(grace: str) -> list:
    """Returns an array of strings of explicit GRACE description

    The first element is the word category (decoded from ``grace[0]``); the
    following ones are the labels of the sub-fields of that category, in the
    order of their position in `grace`.
    Raises IndexError when `grace` is empty or too short for its category
    and KeyError when one of its codes is unknown.
    """
    category = grace[0]
    labels = [wordtype[category]]
    # Sub-field number i (starting at 1) is decoded from character i
    for position, table in enumerate(_grace_layout(category), start=1):
        labels.append(table[grace[position]])
    return labels
 def init_dictionary(path='dico.dictfr'):
    """Loads the language's dictionary into the global `dictionary`.

    The file is a comma-separated CSV (double quote as quote character): its
    first column holds the word itself, the second one the
    semicolon-separated GRACE definitions of the word and the third one the
    word's X-SAMPA phonetics. Each row becomes a `Word` stored under the
    word's plain string; a word listed twice keeps its last row.

    `path` is the location of the dictionary file; it defaults to
    'dico.dictfr' in the current working directory.
    Blank lines are skipped. Raises IndexError when a non-blank row has
    fewer than three columns.
    """
    # NOTE(review): assumes the dictionary file is UTF-8 encoded - confirm.
    # Being explicit avoids depending on the platform's default encoding.
    with open(path, newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            word, grammar, phonetics = row[0], row[1], row[2]
            # split() returns a one-element list when there is no semicolon
            dictionary[word] = Word(word, grammar.split(';'), phonetics)
 def clean_txt(txt):
    """Return `txt` once normalised by the cleaning regexes.

    The substitutions are applied one after the other, in this order:
    1. words split by a hyphen at the end of a line are merged again
    2. groups of :;,'"- characters are replaced by a single space
    3. groups of white characters (\\n, \\r, \\t, space) are replaced by a
       single space
    4. single spaces between ending points (!, ? or .) are removed
    """
    substitutions = (
        (word_breaks, ''),
        (intermediate_punctuation, ' '),
        (whites, ' '),
        (blanks_between_points, ''),
    )
    for pattern, replacement in substitutions:
        txt = pattern.sub(replacement, txt)
    return txt
 def filter_text(txt):
    """Split `txt` into a list of sentences, each one a list of words.

    The text is cleaned, cut at every group of ending punctuation, and each
    sentence is cut at white spaces. Words are lower-cased. A word missing
    from the global `dictionary` is added to it with the unknown GRACE
    description "?" and empty phonetics. The frequency of every word met is
    increased by one in the dictionary. Sentences without any word are
    left out of the result.
    """
    sentences = []
    for raw_sentence in ending_punctuation.split(clean_txt(txt)):
        if not raw_sentence:
            continue
        lowered = [token.lower() for token in raw_sentence.split()]
        for key in lowered:
            if key not in dictionary:
                dictionary[key] = Word(key, ["?"], "")
            dictionary[key].increase_frequency()
        if lowered:
            sentences.append(lowered)
    return sentences
 # XML attribute names of a <word> element for each word category, listed in
 # the order graceToString() returns the decoded fields. Categories missing
 # from this table only get the "wordclass" attribute.
 _WORD_ATTRIBUTES = {
    'N': ("wordclass", "wordtype", "gender", "number"),
    'V': ("wordclass", "wordtype", "mood", "tense", "person", "number",
          "gender"),
    'P': ("wordclass", "wordtype", "person", "gender", "number", "case",
          "possessor"),
    'A': ("wordclass", "wordtype", "degree", "gender", "number"),
    'D': ("wordclass", "wordtype", "person", "gender", "number", "possessor",
          "nature"),
    'R': ("wordclass", "wordtype", "degree"),
    'S': ("wordclass", "wordtype"),
    'C': ("wordclass", "wordtype"),
 }
 def prepare_output(splitted_txt):
    """Build the XML tree describing the filtered text.

    `splitted_txt` is a list of sentences, each sentence being a list of
    keys of the global `dictionary`. The returned <text> element holds one
    <sentence> per sentence and, inside it, one <word> per word: its text is
    the word itself and its attributes are the frequency of the word
    followed by the fields decoded from its first GRACE description.

    A word whose GRACE description cannot be decoded (too short, or holding
    an unknown code) is reported on the standard output and left out of the
    tree. Raises KeyError when a word is missing from `dictionary`.
    """
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            descriptions = word.get_grace()
            # Only the first GRACE description is used; a word without any
            # description is handled like an undecodable one
            grace = descriptions[0] if descriptions else ""
            try:
                # The decoding is what fails on a malformed description, so
                # it has to sit inside the try for the report to happen
                grace_details = graceToString(grace)
            except (IndexError, KeyError):
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
                continue
            names = _WORD_ATTRIBUTES.get(grace[0], ("wordclass",))
            # "frequency" comes first to keep the attributes in a stable order
            attributes = {"frequency": str(word.get_frequency())}
            attributes.update(zip(names, grace_details))
            ET.SubElement(sentence, "word", attributes).text = word.get_word()
    return text
 ###############################################################################
 #                                 main program                                #
 ###############################################################################
 if __name__ == "__main__":
    # arg parse ###################################################################
    # The first argument must name a file ending with ".txt"
    if len(sys.argv) < 2 or not filename_end.search(sys.argv[1]):
        # sys.exit() is used instead of exit(): the latter is only injected by
        # the site module and is not guaranteed to exist
        sys.exit("Usage: ./parser.py *.txt")
    # load files ##################################################################
    # flush=True: without a newline the message could stay in the buffer
    # until the loading is over
    print("Loading input file...", end='', flush=True)
    # NOTE(review): assumes the input text is UTF-8 encoded, like the XML
    # written below - confirm.
    with open(sys.argv[1], 'r', encoding='utf-8') as txt_file:
        txt = txt_file.read()
    print("\rInput file loaded      ")
    print("Loading dictionary...", end='', flush=True)
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")
    # now comes the parsing work ##################################################
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    # The result is written next to the input file, "txt" replaced by "xml"
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)