#!/usr/bin/env python3
# -*- coding: utf-8 -*-
###############################################################################
#                                   imports                                   #
###############################################################################
import csv
import re
import sys
# FIX: xml.etree.cElementTree was deprecated in 3.3 and REMOVED in Python 3.9;
# plain ElementTree uses the C accelerator automatically when available.
import xml.etree.ElementTree as ET

###############################################################################
#                                    Regex                                    #
###############################################################################
# NOTE: all patterns are raw strings; the original used plain strings with
# invalid escape sequences (\!, \,, \- ...), which are SyntaxWarnings on
# modern Python. The matched character sets are unchanged.

# Removes a single blank sitting between two ending-punctuation characters,
# so consecutive sentence terminators form one group.
blanks_between_points = re.compile(r'(?<=[!?.]) (?=[!?.])')

# Finds groups of ending punctuation (!, ? or .).
# It is used to split the text into sentences.
ending_punctuation = re.compile(r'[!?.]+')

# Checks that the argument given to this program ends with at least one
# alphanumeric character followed by ".txt". It is used twice:
# - argument conformity verification
# - resulting file creation (the "txt" suffix is replaced by "xml")
filename_end = re.compile(r'(?<=\w\.)txt$')

# Replaces -, ", ', ;, : and , characters (possibly grouped) by one space.
intermediate_punctuation = re.compile(r'[,;:"\'-]+')

# Replaces groups of whitespace characters (\n, \r, \t, space) by one space.
whites = re.compile(r'[\n\r\t ]+')

# Sometimes words are split across a line break with a hyphen;
# this regex lets us re-merge them.
word_breaks = re.compile(r'-[\n\r]+')

###############################################################################
#                              global variables                               #
###############################################################################
french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"

# input text to parse
txt = ""

# dictionary of words loaded from the linguistic dictionary
# (maps lowercase word -> Word instance)
dictionary = {}

# GRACE code -> human-readable label tables.
# NOTE(review): several labels below look misspelled ("ponctuation",
# "conjonction", "demanstrative", "subortinative", "qualifiative",
# "deictique"). They are kept byte-identical because they are emitted in the
# XML output and debug strings — fixing them would change the output format.
wordtype = {
    'N': "noun",
    'V': "verb",
    'P': "pronoun",
    'A': "adjective",
    'D': "determiner",
    'R': "adverb",
    'S': "adposition",
    'C': "conjonction",
    'I': "interjection",
    'X': "residual",
    'F': "ponctuation",
    '?': "unknown"
}
nountypes = {'c': "common", 'p': "proper", 'k': "cardinal", '-': "None"}
verbtypes = {'m': "main", 'a': "auxiliary", '-': "None"}
verbmood = {
    'i': "indicative",
    's': "subjunctive",
    'm': "imperative",
    'c': "conditional",
    'n': "infinitive",
    'p': "participle",
    '-': "None"
}
verbtense = {
    'p': "present",
    'i': "imperfect",
    'f': "future",
    's': "past",
    '-': "None"
}
adjtype = {
    'f': "qualifiative",
    'o': "ordinal",
    'k': "cardinal",
    'i': "indefinite",
    's': "possessive",
    '-': "None"
}
degree = {'p': "positive", 'c': "comparative", 'n': "negative", '-': "None"}
pronountype = {
    'p': "personal",
    'd': "demanstrative",
    'i': "indefinite",
    's': "possessive",
    't': "interrogative",
    'r': "relative",
    'x': "reflexive",
    'k': "cardinal",
    '-': "None"
}
pronouncase = {
    'n': "nominative",
    'a': "accusative",
    'd': "dative",
    'o': "oblique",
    'g': "genitive",
    '-': "None"
}
detertypes = {
    'a': "article",
    'd': "demonstrative",
    's': "possessive",
    'i': "indefinite",
    't': "iter-excl",
    'r': "relative",
    'k': "cardinal",
    '-': "None"
}
deternature = {'d': "definite", 'i': "indefinite", '-': "None"}
advtypes = {'g': "general", 'p': "particle", 'x': "interro-excl", '-': "None"}
adptypes = {'p': "preposition", 'd': "deictique", '-': "None"}
conjtypes = {'c': "coordinative", 's': "subortinative", '-': "None"}
gender = {'m': "masculine", 'f': "feminine", '-': "None"}
number = {'s': "singular", 'p': "plural", '-': "None"}
person = {'1': "first", '2': "second", '3': "third", '-': "None"}

# Per word-class lookup tables for positions 1..n of a GRACE code, in the
# exact order the original per-class branches consumed them. Shared by
# graceToDebugString and graceToString so the two stay consistent.
_GRACE_FIELDS = {
    'N': (nountypes, gender, number),
    'V': (verbtypes, verbmood, verbtense, person, number, gender),
    'P': (pronountype, person, gender, number, pronouncase, number),
    'A': (adjtype, degree, gender, number),
    'D': (detertypes, person, gender, number, number, deternature),
    'R': (advtypes, degree),
    'S': (adptypes,),
    'C': (conjtypes,),
}

# Per word-class XML attribute names matching _GRACE_FIELDS positions 1..n.
_XML_ATTRS = {
    'N': ("wordtype", "gender", "number"),
    'V': ("wordtype", "mood", "tense", "person", "number", "gender"),
    'P': ("wordtype", "person", "gender", "number", "case", "possessor"),
    'A': ("wordtype", "degree", "gender", "number"),
    'D': ("wordtype", "person", "gender", "number", "possessor", "nature"),
    'R': ("wordtype", "degree"),
    'S': ("wordtype",),
    'C': ("wordtype",),
}


###############################################################################
#                             classes definition                              #
###############################################################################
class Word:
    """Any word found in the language's dictionary.

    It contains its plain string representation, an array of GRACE
    descriptions, an array of its word categories, its phonetics and its
    frequency in the working text.
    """

    def __init__(self, word: str, grammar, phonetics: str):
        """Constructor of the `Word` class.

        It accepts as arguments a string (its plain string representation),
        an array of strings (GRACE description of the word) and its
        phonetics as string too.
        """
        self.__word = word
        self.__grace = grammar
        # First letter of each GRACE description is the word class.
        self.__category = [x[0] for x in grammar]
        self.__phon = phonetics
        self.__frequency = 0

    def __str__(self):
        """Returns stringified `Word`"""
        res = self.__word + "\t/" + self.__phon + "/"
        for elem in self.__grace:
            res += "\n\t" + graceToDebugString(elem)
        return res

    def get_word(self):
        """Returns plain string representation of `Word`"""
        return self.__word[:]

    def get_grace(self):
        """Returns array of GRACE descriptions (string) of `Word`"""
        return self.__grace[:]

    def get_phon(self):
        """Returns phonetics of `Word` as string"""
        return self.__phon[:]

    def increase_frequency(self):
        """Increase by one the frequency of `Word`"""
        self.__frequency += 1

    def get_frequency(self):
        """Get frequency of `Word` as number"""
        return self.__frequency


###############################################################################
#                            functions definition                             #
###############################################################################
def graceToDebugString(grace: str):
    """Turns a GRACE description into a single space-separated string.

    For debug purposes only. Raises KeyError/IndexError on a malformed
    code, exactly like the original per-class branches did.
    """
    parts = [wordtype[grace[0]]]
    for pos, table in enumerate(_GRACE_FIELDS.get(grace[0], ()), start=1):
        parts.append(table[grace[pos]])
    return " ".join(parts)


def graceToString(grace: str):
    """Returns an array of strings of explicit GRACE description"""
    return [wordtype[grace[0]]] + [
        table[grace[pos]]
        for pos, table in enumerate(_GRACE_FIELDS.get(grace[0], ()), start=1)
    ]


def init_dictionary():
    """Loads the language's dictionary into the global `dictionary`.

    Its format is a comma-separated CSV: first column the word itself,
    second a semicolon-separated GRACE definition of the word, and third
    the word's X-SAMPA phonetics.
    """
    # FIX: explicit encoding — the dictionary contains French characters and
    # the platform default codec is not guaranteed to be UTF-8.
    with open('dico.dictfr', newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            # str.split(';') already returns [whole_string] when there is
            # no ';', so the original find()/branch dance is unnecessary.
            grammar = row[1].split(';')
            dictionary[row[0]] = Word(row[0], grammar, row[2])


def clean_txt(txt):
    """This function cleans the string given as argument and returns it."""
    # Sometimes words are split across lines; re-merge them.
    txt = word_breaks.sub('', txt)
    # All of :;,'"- characters will be replaced by a space.
    txt = intermediate_punctuation.sub(' ', txt)
    # All whitespace groups (\n, \r, \t & space) become a single space.
    txt = whites.sub(' ', txt)
    # Blanks between ending points (!, ? or .) are removed.
    txt = blanks_between_points.sub('', txt)
    return txt


def filter_text(txt):
    """This function constructs a list of filtered sentences.

    Each filtered sentence is itself a list of strings; each string is a
    lowercase word. Words absent from the dictionary are added with an
    unknown ("?") GRACE code. As a side effect the global `dictionary`
    accumulates per-word frequencies.
    """
    filtered_text = []
    for sentence in filter(None, ending_punctuation.split(clean_txt(txt))):
        word_list = []
        for word in filter(None, sentence.split()):
            lower_word = word.lower()
            if lower_word not in dictionary:
                dictionary[lower_word] = Word(lower_word, ["?"], "")
            word_list.append(lower_word)
            dictionary[lower_word].increase_frequency()
        if word_list:
            filtered_text.append(word_list)
    return filtered_text


def prepare_output(splitted_txt):
    """Builds and returns the output XML tree ("text" root element).

    `splitted_txt` is the list of sentences produced by filter_text();
    every word must already exist in the global `dictionary`.
    """
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            # Only the first GRACE description is used.
            grace = word.get_grace()[0]
            grace_details = graceToString(grace)
            # Attribute insertion order (frequency, wordclass, then the
            # class-specific fields) matches the original kwargs order.
            attrs = {
                "frequency": str(word.get_frequency()),
                "wordclass": str(grace_details[0]),
            }
            try:
                names = _XML_ATTRS.get(grace[0], ())
                for pos, attr_name in enumerate(names, start=1):
                    attrs[attr_name] = grace_details[pos]
                ET.SubElement(sentence, "word", attrs).text = word.get_word()
            except IndexError:
                # Malformed GRACE code shorter than its class expects:
                # skip the word but keep going (best effort, as before).
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
    return text


###############################################################################
#                                main program                                 #
###############################################################################
if __name__ == "__main__":
    # arg parse ###############################################################
    if len(sys.argv) < 2 or filename_end.findall(sys.argv[1]) == []:
        # FIX: sys.exit() instead of the site-module exit() helper.
        sys.exit("Usage: ./parser.py *.txt")

    # load files ##############################################################
    print("Loading input file...", end='')
    with open(sys.argv[1], 'r', encoding='utf-8') as txt_file:
        txt = txt_file.read()
    # Trailing spaces overwrite the longer "Loading input file..." message.
    print("\rInput file loaded    ")
    print("Loading dictionary...", end='')
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")

    # now comes the parsing work ##############################################
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)