You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
476 lines
16 KiB
476 lines
16 KiB
#!/usr/bin/env python3 |
|
# -*- coding: utf-8 -*- |
|
|
|
############################################################################### |
|
# imports # |
|
############################################################################### |
|
import csv
import re
import sys

try:
    import xml.etree.cElementTree as ET
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree offers the same API.
    import xml.etree.ElementTree as ET
|
|
|
############################################################################### |
|
# Regex # |
|
############################################################################### |
|
|
|
# Matches a single space located between two ending punctuation marks
# (!, ? or .), so that "? !" can be collapsed into "?!".
blanks_between_points = re.compile(r'(?<=[!?.]) (?=[!?.])')

# Matches groups of ending punctuation (!, ? or .).
# It is used to split text into sentences.
ending_punctuation = re.compile(r'[!?.]+')

# Matches a trailing "txt" that is preceded by at least one word character
# and a dot. It is used twice:
# - Argument conformity verification
# - Resulting file creation
#   (the "txt" characters of the argument are replaced by "xml")
filename_end = re.compile(r'(?<=\w\.)txt$')

# Matches groups of - " ' ; : and , characters, which are replaced by a
# single space.
intermediate_punctuation = re.compile(r'[,;:"\'-]+')

# Matches groups of white characters (\n, \r, \t and space), which are
# replaced by a single space.
whites = re.compile(r'[\n\r\t ]+')

# Matches a dash followed by line breaks: words split across lines are merged
# back by removing this sequence.
word_breaks = re.compile(r'-[\n\r]+')
|
|
|
############################################################################### |
|
# global variables # |
|
############################################################################### |
|
|
|
# Basic Latin letters followed by the accented letters and ligatures listed
# by the original author (note the capital 'Ç' among lower-case letters).
french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"

# Input text to parse.
txt = ""

# Words known to the program: plain string -> `Word` instance.
dictionary = {}

# --- GRACE decoding tables ---------------------------------------------------
# Each table maps one character of a GRACE description to its label.
# NOTE(review): several labels are misspelled ("conjonction", "ponctuation",
# "qualifiative", "demanstrative", "subortinative", "iter-excl"). They are
# emitted verbatim in the XML output, so they are kept unchanged here.

# First character of a description: the word class.
wordtype = {
    "N": "noun",
    "V": "verb",
    "P": "pronoun",
    "A": "adjective",
    "D": "determiner",
    "R": "adverb",
    "S": "adposition",
    "C": "conjonction",
    "I": "interjection",
    "X": "residual",
    "F": "ponctuation",
    "?": "unknown",
}

# Noun sub-type.
nountypes = {
    "c": "common",
    "p": "proper",
    "k": "cardinal",
    "-": "None",
}

# Verb sub-type.
verbtypes = {
    "m": "main",
    "a": "auxiliary",
    "-": "None",
}

# Verb mood.
verbmood = {
    "i": "indicative",
    "s": "subjunctive",
    "m": "imperative",
    "c": "conditional",
    "n": "infinitive",
    "p": "participle",
    "-": "None",
}

# Verb tense.
verbtense = {
    "p": "present",
    "i": "imperfect",
    "f": "future",
    "s": "past",
    "-": "None",
}

# Adjective sub-type.
adjtype = {
    "f": "qualifiative",
    "o": "ordinal",
    "k": "cardinal",
    "i": "indefinite",
    "s": "possessive",
    "-": "None",
}

# Degree, used for adjectives and adverbs.
degree = {
    "p": "positive",
    "c": "comparative",
    "n": "negative",
    "-": "None",
}

# Pronoun sub-type.
pronountype = {
    "p": "personal",
    "d": "demanstrative",
    "i": "indefinite",
    "s": "possessive",
    "t": "interrogative",
    "r": "relative",
    "x": "reflexive",
    "k": "cardinal",
    "-": "None",
}

# Pronoun case.
pronouncase = {
    "n": "nominative",
    "a": "accusative",
    "d": "dative",
    "o": "oblique",
    "g": "genitive",
    "-": "None",
}

# Determiner sub-type.
detertypes = {
    "a": "article",
    "d": "demonstrative",
    "s": "possessive",
    "i": "indefinite",
    "t": "iter-excl",
    "r": "relative",
    "k": "cardinal",
    "-": "None",
}

# Determiner nature.
deternature = {
    "d": "definite",
    "i": "indefinite",
    "-": "None",
}

# Adverb sub-type.
advtypes = {
    "g": "general",
    "p": "particle",
    "x": "interro-excl",
    "-": "None",
}

# Adposition sub-type.
adptypes = {
    "p": "preposition",
    "d": "deictique",
    "-": "None",
}

# Conjunction sub-type.
conjtypes = {
    "c": "coordinative",
    "s": "subortinative",
    "-": "None",
}

# Grammatical gender.
gender = {
    "m": "masculine",
    "f": "feminine",
    "-": "None",
}

# Grammatical number.
number = {
    "s": "singular",
    "p": "plural",
    "-": "None",
}

# Grammatical person.
person = {
    "1": "first",
    "2": "second",
    "3": "third",
    "-": "None",
}
|
|
|
############################################################################### |
|
# classes definition # |
|
############################################################################### |
|
|
|
|
|
class Word:
    """Any word found in the language's dictionary.

    It holds the word's plain string representation, the list of its GRACE
    descriptions, the list of its word categories (first character of each
    GRACE description), its phonetics and its frequency in the working text.
    """

    def __init__(self, word: str, grammar, phonetics: str):
        """Build a `Word`.

        Args:
            word: plain string representation of the word.
            grammar: list of GRACE description strings for the word.
            phonetics: phonetics of the word.

        Raises:
            IndexError: if one of the GRACE descriptions is an empty string.
        """
        self.__word = word
        # Copy the list so that later changes made by the caller to its own
        # list cannot silently alter this entry.
        self.__grace = list(grammar)
        self.__category = [description[0] for description in self.__grace]
        self.__phon = phonetics
        self.__frequency = 0

    def __str__(self) -> str:
        """Return the word, its phonetics, then one decoded GRACE per line."""
        lines = [self.__word + "\t/" + self.__phon + "/"]
        lines.extend(graceToDebugString(elem) for elem in self.__grace)
        return "\n\t".join(lines)

    def get_word(self) -> str:
        """Return the plain string representation of the word."""
        return self.__word

    def get_grace(self):
        """Return a copy of the list of GRACE descriptions of the word."""
        return self.__grace[:]

    def get_phon(self) -> str:
        """Return the phonetics of the word."""
        return self.__phon

    def increase_frequency(self) -> None:
        """Increase the frequency of the word by one."""
        self.__frequency += 1

    def get_frequency(self) -> int:
        """Return the frequency of the word."""
        return self.__frequency
|
|
|
|
|
############################################################################### |
|
# functions definition # |
|
############################################################################### |
|
|
|
|
|
def graceToDebugString(grace: str):
    """Turn a GRACE description into a single space-separated string.

    For debug purposes only. The labels are the ones produced by
    `graceToString`, in the same order, so both functions always agree.

    Args:
        grace: one GRACE description.

    Returns:
        The decoded labels joined by single spaces.

    Raises:
        IndexError: if `grace` is shorter than its word class requires.
        KeyError: if a character has no entry in the matching lookup table.
    """
    return " ".join(graceToString(grace))
|
|
|
|
|
def graceToString(grace: str):
    """Decode a GRACE description into a list of explicit labels.

    The first label is always the word class; the following ones depend on
    that class and are read from fixed positions of the description.

    Raises:
        IndexError: if `grace` is shorter than its word class requires.
        KeyError: if a character has no entry in the matching lookup table.
    """
    category = grace[0]

    # For each word class: (lookup table, position in the description).
    if category == 'N':
        layout = ((nountypes, 1), (gender, 2), (number, 3))
    elif category == 'V':
        layout = ((verbtypes, 1), (verbmood, 2), (verbtense, 3),
                  (person, 4), (number, 5), (gender, 6))
    elif category == 'P':
        layout = ((pronountype, 1), (person, 2), (gender, 3),
                  (number, 4), (pronouncase, 5), (number, 6))
    elif category == 'A':
        layout = ((adjtype, 1), (degree, 2), (gender, 3), (number, 4))
    elif category == 'D':
        layout = ((detertypes, 1), (person, 2), (gender, 3),
                  (number, 4), (number, 5), (deternature, 6))
    elif category == 'R':
        layout = ((advtypes, 1), (degree, 2))
    elif category == 'S':
        layout = ((adptypes, 1),)
    elif category == 'C':
        layout = ((conjtypes, 1),)
    else:
        # Remaining classes only expose their word class.
        layout = ()

    labels = [wordtype[category]]
    for table, position in layout:
        labels.append(table[grace[position]])
    return labels
|
|
|
|
|
def init_dictionary(path='dico.dictfr'):
    """Load the language's dictionary into the global `dictionary`.

    The file is a comma-separated CSV: the first column holds the word
    itself, the second its semicolon-separated GRACE descriptions and the
    third its X-SAMPA phonetics.

    Args:
        path: location of the dictionary file (defaults to the historical
            'dico.dictfr' in the current directory).

    Raises:
        OSError: if the file cannot be opened.
    """
    # NOTE(review): assumes the dictionary is UTF-8 encoded, like the XML this
    # program writes; without an explicit encoding the platform default would
    # be used and accented words could be mis-decoded. Confirm.
    with open(path, newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            if len(row) < 3:
                # Blank or truncated line: nothing usable to load.
                continue
            # str.split returns a one-element list when there is no ';'.
            dictionary[row[0]] = Word(row[0], row[1].split(';'), row[2])
|
|
|
|
|
def clean_txt(txt):
    """Return `txt` normalised for sentence and word splitting."""
    # Order matters: each step works on the output of the previous one.
    substitutions = (
        # Merge back words that were split across lines.
        (word_breaks, ''),
        # Replace groups of : ; , ' " - characters by a space.
        (intermediate_punctuation, ' '),
        # Collapse groups of white characters into a single space.
        (whites, ' '),
        # Remove the space left between two ending punctuation marks.
        (blanks_between_points, ''),
    )
    cleaned = txt
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
|
|
|
|
|
def filter_text(txt):
    """Split `txt` into sentences, each being a list of lower-cased words.

    The text is cleaned, cut on ending punctuation, then each sentence is cut
    on whitespace. Words missing from the global `dictionary` are registered
    there with the "?" GRACE description, and the frequency of every word met
    is increased by one. Sentences without any word are dropped.
    """
    sentences = []
    for chunk in ending_punctuation.split(clean_txt(txt)):
        if not chunk:
            continue
        tokens = []
        for token in chunk.split():
            key = token.lower()
            if key not in dictionary:
                dictionary[key] = Word(key, ["?"], "")
            tokens.append(key)
            dictionary[key].increase_frequency()
        if tokens:
            sentences.append(tokens)
    return sentences
|
|
|
|
|
# XML attribute names, in order, matching the labels returned by
# graceToString() for each word class. Classes not listed here only get the
# "wordclass" attribute.
_WORD_ATTRIBUTES = {
    'N': ("wordclass", "wordtype", "gender", "number"),
    'V': ("wordclass", "wordtype", "mood", "tense", "person", "number",
          "gender"),
    'P': ("wordclass", "wordtype", "person", "gender", "number", "case",
          "possessor"),
    'A': ("wordclass", "wordtype", "degree", "gender", "number"),
    'D': ("wordclass", "wordtype", "person", "gender", "number", "possessor",
          "nature"),
    'R': ("wordclass", "wordtype", "degree"),
    'S': ("wordclass", "wordtype"),
    'C': ("wordclass", "wordtype"),
}


def prepare_output(splitted_txt):
    """Build the XML tree describing the parsed text.

    Args:
        splitted_txt: list of sentences, each a list of words that are keys
            of the global `dictionary`.

    Returns:
        A "text" element holding one "sentence" element per sentence, itself
        holding one "word" element per word. Each word carries its frequency
        and the decoded labels of its first GRACE description as attributes.
        Words whose GRACE description cannot be decoded are reported on
        standard output and left out of the tree.
    """
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            # First GRACE description used
            grace = word.get_grace()[0]
            try:
                # Decoding is what fails on a malformed description, so it has
                # to happen inside the try for the report below to be reached.
                grace_details = graceToString(grace)
            except (IndexError, KeyError):
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
                continue
            names = _WORD_ATTRIBUTES.get(grace[0], ("wordclass",))
            attributes = {"frequency": str(word.get_frequency())}
            attributes.update(zip(names, grace_details))
            ET.SubElement(sentence, "word", attributes).text = word.get_word()
    return text
|
|
|
|
|
############################################################################### |
|
# main program # |
|
############################################################################### |
|
|
|
if __name__ == "__main__":
    # arg parse: exactly one usable argument is expected, a "*.txt" file name
    if len(sys.argv) < 2 or filename_end.search(sys.argv[1]) is None:
        # sys.exit is used rather than the exit() helper, which is only
        # injected by the site module and is meant for interactive use.
        sys.exit("Usage: ./parser.py *.txt")

    # load files
    print("Loading input file...", end='')
    # NOTE(review): assumes the input text is UTF-8 encoded, like the XML
    # written below; confirm against the files actually processed.
    with open(sys.argv[1], 'r', encoding='utf-8') as txt_file:
        txt = txt_file.read()
    print("\rInput file loaded ")
    print("Loading dictionary...", end='')
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")

    # now comes the parsing work
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    # The output file is named after the input one, "txt" becoming "xml".
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)
|
|
|