# detailed-french-parser/parser.py
#!/usr/bin/env python3
# Last modified: 2019-03-28 18:52:12 +00:00
# -*- coding: utf-8 -*-
###############################################################################
# imports #
###############################################################################
import csv
import re
import sys
import xml.etree.cElementTree as ET
###############################################################################
# Regex #
###############################################################################
# All patterns are raw strings: the original non-raw literals contained
# invalid string escapes ('\!', '\?', '\-', '\w'), which raise
# DeprecationWarning (SyntaxWarning on Python 3.12+). Inside a character
# class, '!', '?' and '.' need no escaping, so the patterns are unchanged.
#
# Removes the single space sitting between two ending marks (!, ? or .),
# re-merging punctuation groups that were split apart.
blanks_between_points = re.compile(r'(?<=[!?.]) (?=[!?.])')
# Finds groups of ending punctuation (!, ? or .).
# Used to split the text into sentences.
ending_punctuation = re.compile(r'[!?.]+')
# Checks that the argument given to this program ends with at least one
# word character followed by ".txt". It is used twice:
# - argument conformity verification
# - resulting file creation (the "txt" suffix is replaced by "xml")
filename_end = re.compile(r'(?<=\w\.)txt$')
# Replaces -, ", ', ;, : and , characters (possibly grouped) by one space.
intermediate_punctuation = re.compile(r'[-,;:"\']+')
# Replaces runs of whitespace characters (\n, \r, \t, space) by one space.
whites = re.compile(r'[\n\r\t ]+')
# Re-merges words hyphenated across a line break.
word_breaks = re.compile(r'-[\n\r]+')
###############################################################################
# global variables #
###############################################################################
# Letters accepted as part of French words.
# NOTE(review): not referenced elsewhere in this file — confirm before removal.
french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"
# input text to parse
txt = ""
# dictionary of words loaded from linguistic dictionary
dictionary = {}
# The tables below map single GRACE code letters to explicit English labels.
# Several labels were misspelled or left in French in the original
# ("conjonction", "ponctuation", "demanstrative", "subortinative",
# "qualifiative", "deictique"); they are corrected here, which changes the
# strings emitted in the XML output.
# First GRACE letter: the word class.
wordtype = {
    'N': "noun",
    'V': "verb",
    'P': "pronoun",
    'A': "adjective",
    'D': "determiner",
    'R': "adverb",
    'S': "adposition",
    'C': "conjunction",
    'I': "interjection",
    'X': "residual",
    'F': "punctuation",
    '?': "unknown"
}
nountypes = {'c': "common", 'p': "proper", 'k': "cardinal", '-': "None"}
verbtypes = {'m': "main", 'a': "auxiliary", '-': "None"}
verbmood = {
    'i': "indicative",
    's': "subjunctive",
    'm': "imperative",
    'c': "conditional",
    'n': "infinitive",
    'p': "participle",
    '-': "None"
}
verbtense = {
    'p': "present",
    'i': "imperfect",
    'f': "future",
    's': "past",
    '-': "None"
}
adjtype = {
    'f': "qualificative",
    'o': "ordinal",
    'k': "cardinal",
    'i': "indefinite",
    's': "possessive",
    '-': "None"
}
degree = {'p': "positive", 'c': "comparative", 'n': "negative", '-': "None"}
pronountype = {
    'p': "personal",
    'd': "demonstrative",
    'i': "indefinite",
    's': "possessive",
    't': "interrogative",
    'r': "relative",
    'x': "reflexive",
    'k': "cardinal",
    '-': "None"
}
pronouncase = {
    'n': "nominative",
    'a': "accusative",
    'd': "dative",
    'o': "oblique",
    'g': "genitive",
    '-': "None"
}
detertypes = {
    'a': "article",
    'd': "demonstrative",
    's': "possessive",
    'i': "indefinite",
    't': "iter-excl",  # NOTE(review): possibly meant "inter-excl" (cf. advtypes 'x') — confirm
    'r': "relative",
    'k': "cardinal",
    '-': "None"
}
deternature = {'d': "definite", 'i': "indefinite", '-': "None"}
advtypes = {'g': "general", 'p': "particle", 'x': "interro-excl", '-': "None"}
adptypes = {'p': "preposition", 'd': "deictic", '-': "None"}
conjtypes = {'c': "coordinative", 's': "subordinative", '-': "None"}
gender = {'m': "masculine", 'f': "feminine", '-': "None"}
number = {'s': "singular", 'p': "plural", '-': "None"}
person = {'1': "first", '2': "second", '3': "third", '-': "None"}
###############################################################################
# classes definition #
###############################################################################
class Word:
    """A word found in the language's dictionary.

    Holds the plain string form, its GRACE descriptions, the word-class
    initials derived from them, its X-SAMPA phonetics and its frequency in
    the working text.
    """

    def __init__(self, word: str, grammar, phonetics: str):
        """Build a `Word` from its plain form, a list of GRACE description
        strings and its phonetics string. Frequency starts at zero.
        """
        self.__word = word
        self.__grace = grammar
        # One word-class initial (first GRACE letter) per description.
        self.__category = [code[0] for code in grammar]
        self.__phon = phonetics
        self.__frequency = 0

    def __str__(self):
        """Human-readable dump: form, phonetics, one line per GRACE code."""
        lines = [self.__word + "\t/" + self.__phon + "/"]
        lines.extend("\t" + graceToDebugString(code) for code in self.__grace)
        return "\n".join(lines)

    def get_word(self):
        """Return a copy of the plain string form of this `Word`."""
        return self.__word[:]

    def get_grace(self):
        """Return a copy of the list of GRACE descriptions (strings)."""
        return self.__grace[:]

    def get_phon(self):
        """Return a copy of the phonetics string of this `Word`."""
        return self.__phon[:]

    def increase_frequency(self):
        """Increase the frequency counter of this `Word` by one."""
        self.__frequency += 1

    def get_frequency(self):
        """Return the frequency of this `Word` as a number."""
        return self.__frequency
###############################################################################
# functions definition #
###############################################################################
def graceToDebugString(grace: str):
    """Turn a GRACE description into a single space-separated string.

    For debug purposes only. The original hand-expanded every word class
    field-by-field in the exact same order as `graceToString` (plus dead
    `elif ... pass` branches); it is now a thin wrapper around it, so the
    two renderings can no longer drift apart.
    """
    return " ".join(graceToString(grace))
def graceToString(grace: str):
    """Return a list of explicit description strings for a GRACE code.

    Position 0 of the result is always the word class; the remaining
    positions are looked up, letter by letter, in the class-specific tables
    below. Unknown word classes yield the word class alone.
    """
    # Lookup tables for code positions 1..n of each word class.
    field_tables = {
        'N': (nountypes, gender, number),
        'V': (verbtypes, verbmood, verbtense, person, number, gender),
        'P': (pronountype, person, gender, number, pronouncase, number),
        'A': (adjtype, degree, gender, number),
        'D': (detertypes, person, gender, number, number, deternature),
        'R': (advtypes, degree),
        'S': (adptypes,),
        'C': (conjtypes,),
    }
    described = [wordtype[grace[0]]]
    for pos, table in enumerate(field_tables.get(grace[0], ()), start=1):
        # Indexing grace[pos] raises IndexError on truncated codes, exactly
        # like the original per-class expansion did.
        described.append(table[grace[pos]])
    return described
def init_dictionary():
    """Load the language's dictionary into the global `dictionary`.

    The file `dico.dictfr` is a comma-separated CSV: first column the word
    itself, second column the semicolon-separated GRACE definitions, third
    column the word's X-SAMPA phonetics.
    """
    # Explicit UTF-8 instead of the locale default, since the dictionary
    # contains accented French characters.
    # NOTE(review): assumes dico.dictfr is UTF-8-encoded — confirm.
    with open('dico.dictfr', newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            # str.split(';') already returns a one-element list when no ';'
            # is present, so the original "contains ';'" special case was
            # redundant.
            dictionary[row[0]] = Word(row[0], row[1].split(';'), row[2])
def clean_txt(txt):
    """Return a normalised copy of the given string, ready for splitting."""
    # Re-join words hyphenated across line breaks.
    cleaned = word_breaks.sub('', txt)
    # Intermediate punctuation (:, ;, ,, ', ", -) becomes a single space.
    cleaned = intermediate_punctuation.sub(' ', cleaned)
    # Collapse runs of whitespace (\n, \r, \t, space) to one space.
    cleaned = whites.sub(' ', cleaned)
    # Drop the spaces sitting between ending marks (!, ? or .).
    return blanks_between_points.sub('', cleaned)
def filter_text(txt):
    """Build a list of filtered sentences from the raw input text.

    Each sentence is a list of lowercase word strings. As a side effect the
    global `dictionary` gains a placeholder `Word` (GRACE code "?") for every
    token it does not know, and every token's frequency is incremented.
    """
    sentences = []
    for raw_sentence in ending_punctuation.split(clean_txt(txt)):
        if not raw_sentence:
            continue
        words = []
        # str.split() never yields empty tokens, so no extra filtering needed.
        for token in raw_sentence.split():
            lowered = token.lower()
            if lowered not in dictionary:
                dictionary[lowered] = Word(lowered, ["?"], "")
            words.append(lowered)
            dictionary[lowered].increase_frequency()
        if words:
            sentences.append(words)
    return sentences
def prepare_output(splitted_txt):
    """Build and return an XML <text> element from the filtered sentences.

    Each sentence becomes a <sentence> element; each word becomes a <word>
    element carrying its frequency and the explicit GRACE attributes for its
    word class. Words whose GRACE code is too short to expand are reported on
    stdout and skipped.

    Fix over the original: `graceToString(grace)` was called *outside* the
    `try`, so the IndexError that the `except` clause is meant to report
    escaped uncaught on malformed codes. It is now inside the `try`.
    """
    # XML attribute names per word class, aligned with the field order of
    # graceToString's output; classes not listed get "wordclass" only.
    attr_names = {
        'N': ("wordclass", "wordtype", "gender", "number"),
        'V': ("wordclass", "wordtype", "mood", "tense", "person",
              "number", "gender"),
        'P': ("wordclass", "wordtype", "person", "gender", "number",
              "case", "possessor"),
        'A': ("wordclass", "wordtype", "degree", "gender", "number"),
        'D': ("wordclass", "wordtype", "person", "gender", "number",
              "possessor", "nature"),
        'R': ("wordclass", "wordtype", "degree"),
        'S': ("wordclass", "wordtype"),
        'C': ("wordclass", "wordtype"),
    }
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            # Only the first GRACE description is used.
            grace = word.get_grace()[0]
            try:
                grace_details = graceToString(grace)
                names = attr_names.get(grace[0], ("wordclass",))
                # "frequency" first so attribute order matches the original
                # keyword-argument order (insertion order is preserved).
                attrs = {"frequency": str(word.get_frequency())}
                for i, name in enumerate(names):
                    attrs[name] = grace_details[i]
                ET.SubElement(sentence, "word", attrs).text = word.get_word()
            except IndexError:
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
    return text
###############################################################################
# main program #
###############################################################################
if __name__ == "__main__":
    # arg parse ###############################################################
    # Require one *.txt argument with at least one word character before
    # ".txt" (checked by the filename_end regex).
    if len(sys.argv) < 2 or filename_end.findall(sys.argv[1]) == []:
        # sys.exit instead of the site-module exit() helper, which is not
        # guaranteed to exist when run with `python -S`.
        sys.exit("Usage: ./parser.py *.txt")
    # load files ##############################################################
    print("Loading input file...", end='')
    # NOTE(review): assumes the input text is UTF-8 — confirm.
    with open(sys.argv[1], 'r', encoding='utf-8') as txt_file:
        txt = txt_file.read()
    print("\rInput file loaded ")
    print("Loading dictionary...", end='')
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")
    # now comes the parsing work ##############################################
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    # The ".txt" suffix of the input name is swapped for "xml".
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)