commit
1245552e24
4 changed files with 1037386 additions and 0 deletions
@ -0,0 +1 @@ |
||||
*.xml |
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,475 @@ |
||||
#!/usr/bin/env python3

###############################################################################
# imports                                                                     #
###############################################################################
import csv
import re
import sys
# xml.etree.cElementTree was deprecated in 3.3 and REMOVED in Python 3.9;
# ElementTree is the canonical (C-accelerated) implementation.
import xml.etree.ElementTree as ET
||||
|
||||
############################################################################### |
||||
# Regex # |
||||
############################################################################### |
||||
|
||||
# NOTE: the original patterns used backslash escapes inside plain strings
# ('\!', '\,', '\w', ...).  Most of those are invalid string escapes that
# raise SyntaxWarning on modern Python; none of the escaped characters are
# regex metacharacters inside a character class anyway.  Patterns below are
# equivalent, written as raw strings where a regex backslash is needed.

# Removes a single blank squeezed between two ending-punctuation characters,
# so that e.g. "! ." collapses to "!.".
blanks_between_points = re.compile('(?<=[!?.]) (?=[!?.])')

# This regex will find groups of ending punctuation (!, ? or .)
# It will be used to split text into sentences.
ending_punctuation = re.compile('[!?.]+')

# This regex checks that the end of the argument given to this program
# contains at least one alphanumeric character followed by ".txt".  It is
# used twice:
# - Argument conformity verification
# - Resulting file creation
#   (the "txt" characters of the argument are replaced by "xml")
filename_end = re.compile(r'(?<=\w\.)txt$')

# This regex is used to replace -, ", ', ;, : & , characters (possibly
# grouped) by a single space.
intermediate_punctuation = re.compile('[,;:"\'-]+')

# Goal of this regex is to replace groups of white characters
# (\n, \r, \t, space) by single spaces.
whites = re.compile('[\n\r\t ]+')

# Sometimes words are split across a line break with a hyphen; this regex
# lets them be re-merged.
word_breaks = re.compile('-[\n\r]+')
||||
|
||||
############################################################################### |
||||
# global variables # |
||||
############################################################################### |
||||
|
||||
# Letters accepted as part of French words.
# NOTE(review): this constant does not appear to be used in this file —
# confirm before removing.
french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"

# input text to parse
txt = ""

# dictionary of words loaded from linguistic dictionary
dictionary = {}

# The tables below map single GRACE code characters to human-readable
# labels used as XML attribute values.
# NOTE(review): several labels were misspelled in the original
# ("conjonction", "ponctuation", "demanstrative", "qualifiative",
# "subortinative"); they are fixed to standard English spellings here.

wordtype = {
    'N': "noun",
    'V': "verb",
    'P': "pronoun",
    'A': "adjective",
    'D': "determiner",
    'R': "adverb",
    'S': "adposition",
    'C': "conjunction",
    'I': "interjection",
    'X': "residual",
    'F': "punctuation",
    '?': "unknown"
}

nountypes = {'c': "common", 'p': "proper", 'k': "cardinal", '-': "None"}

verbtypes = {'m': "main", 'a': "auxiliary", '-': "None"}

verbmood = {
    'i': "indicative",
    's': "subjunctive",
    'm': "imperative",
    'c': "conditional",
    'n': "infinitive",
    'p': "participle",
    '-': "None"
}

verbtense = {
    'p': "present",
    'i': "imperfect",
    'f': "future",
    's': "past",
    '-': "None"
}

adjtype = {
    'f': "qualificative",
    'o': "ordinal",
    'k': "cardinal",
    'i': "indefinite",
    's': "possessive",
    '-': "None"
}

degree = {'p': "positive", 'c': "comparative", 'n': "negative", '-': "None"}

pronountype = {
    'p': "personal",
    'd': "demonstrative",
    'i': "indefinite",
    's': "possessive",
    't': "interrogative",
    'r': "relative",
    'x': "reflexive",
    'k': "cardinal",
    '-': "None"
}

pronouncase = {
    'n': "nominative",
    'a': "accusative",
    'd': "dative",
    'o': "oblique",
    'g': "genitive",
    '-': "None"
}

detertypes = {
    'a': "article",
    'd': "demonstrative",
    's': "possessive",
    'i': "indefinite",
    # NOTE(review): original said "iter-excl"; "interro-excl" matches the
    # interrogative/exclamative label style used by advtypes — confirm.
    't': "interro-excl",
    'r': "relative",
    'k': "cardinal",
    '-': "None"
}

deternature = {'d': "definite", 'i': "indefinite", '-': "None"}

advtypes = {'g': "general", 'p': "particle", 'x': "interro-excl", '-': "None"}

# NOTE(review): "deictique" (French) translated to "deictic" for
# consistency with the other English labels — confirm downstream consumers.
adptypes = {'p': "preposition", 'd': "deictic", '-': "None"}

conjtypes = {'c': "coordinative", 's': "subordinative", '-': "None"}

gender = {'m': "masculine", 'f': "feminine", '-': "None"}

number = {'s': "singular", 'p': "plural", '-': "None"}

person = {'1': "first", '2': "second", '3': "third", '-': "None"}
||||
|
||||
############################################################################### |
||||
# classes definition # |
||||
############################################################################### |
||||
|
||||
|
||||
class Word:
    """A word found in the language's dictionary.

    Holds the plain string form of the word, its GRACE descriptions, the
    one-letter word categories derived from them, its X-SAMPA phonetics and
    its frequency in the working text.
    """

    def __init__(self, word: str, grammar, phonetics: str):
        """Build a `Word`.

        `word` is the plain string form, `grammar` an array of GRACE
        description strings, `phonetics` the X-SAMPA phonetics string.
        """
        self.__word = word
        self.__grace = grammar
        # The GRACE category is the first character of each description.
        self.__category = [description[0] for description in grammar]
        self.__phon = phonetics
        self.__frequency = 0

    def __str__(self):
        """Return a multi-line debug string for this `Word`."""
        lines = [self.__word + "\t/" + self.__phon + "/"]
        for description in self.__grace:
            lines.append("\t" + graceToDebugString(description))
        return "\n".join(lines)

    def get_word(self):
        """Return the plain string representation of this `Word`."""
        return self.__word[:]

    def get_grace(self):
        """Return a copy of the GRACE descriptions of this `Word`."""
        return self.__grace[:]

    def get_phon(self):
        """Return the phonetics of this `Word` as a string."""
        return self.__phon[:]

    def increase_frequency(self):
        """Bump the frequency counter of this `Word` by one."""
        self.__frequency += 1

    def get_frequency(self):
        """Return the frequency of this `Word` as a number."""
        return self.__frequency
||||
|
||||
|
||||
############################################################################### |
||||
# functions definition # |
||||
############################################################################### |
||||
|
||||
|
||||
def graceToDebugString(grace: str):
    """Turn a GRACE description into a single space-separated string.

    For debug purposes only.
    """
    # For each part-of-speech code, the lookup tables for the following
    # character positions of the GRACE string, in order.
    field_tables = {
        'N': (nountypes, gender, number),
        'V': (verbtypes, verbmood, verbtense, person, number, gender),
        'P': (pronountype, person, gender, number, pronouncase, number),
        'A': (adjtype, degree, gender, number),
        'D': (detertypes, person, gender, number, number, deternature),
        'R': (advtypes, degree),
        'S': (adptypes,),
        'C': (conjtypes,),
    }
    parts = [wordtype[grace[0]]]
    # Explicit indexing (not zip) keeps the original IndexError on
    # malformed/short GRACE strings.
    for position, table in enumerate(field_tables.get(grace[0], ()), start=1):
        parts.append(table[grace[position]])
    return " ".join(parts)
||||
|
||||
|
||||
def graceToString(grace: str):
    """Return an array of strings spelling out a GRACE description."""
    # For each part-of-speech code, the lookup tables for the following
    # character positions of the GRACE string, in order.  Codes with no
    # entry ('I', 'X', 'F', '?') yield only the word type.
    field_tables = {
        'N': (nountypes, gender, number),
        'V': (verbtypes, verbmood, verbtense, person, number, gender),
        'P': (pronountype, person, gender, number, pronouncase, number),
        'A': (adjtype, degree, gender, number),
        'D': (detertypes, person, gender, number, number, deternature),
        'R': (advtypes, degree),
        'S': (adptypes,),
        'C': (conjtypes,),
    }
    result = [wordtype[grace[0]]]
    for position, table in enumerate(field_tables.get(grace[0], ()), start=1):
        result.append(table[grace[position]])
    return result
||||
|
||||
|
||||
def init_dictionary():
    """Load the language's dictionary into the global `dictionary`.

    The file format is comma-separated CSV: first column the word itself,
    second column the semicolon-separated GRACE definitions of the word,
    third column the word's X-SAMPA phonetics.
    """
    # Explicit encoding: the dictionary contains French accented characters
    # and should not depend on the locale default (assumed UTF-8 — confirm).
    with open('dico.dictfr', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            # str.split already returns the whole string as a one-element
            # list when ';' is absent, so no special case is needed.
            dictionary[row[0]] = Word(row[0], row[1].split(';'), row[2])
||||
|
||||
|
||||
def clean_txt(txt):
    """Clean the string given as argument and return it.

    Applies, in order: re-merging of hyphen-split words, replacement of
    intermediate punctuation by spaces, collapsing of whitespace runs, and
    removal of blanks between ending-punctuation characters.
    """
    # (pattern, replacement) pairs, applied in this exact order — the
    # whitespace collapse must happen before blanks between points are
    # removed.
    substitutions = (
        (word_breaks, ''),
        (intermediate_punctuation, ' '),
        (whites, ' '),
        (blanks_between_points, ''),
    )
    for pattern, replacement in substitutions:
        txt = pattern.sub(replacement, txt)
    return txt
||||
|
||||
|
||||
def filter_text(txt):
    """Build a list of filtered sentences from the raw text.

    Each filtered sentence is a list of lowercase word strings.  Words not
    already in the global `dictionary` are added with an unknown ("?") GRACE
    description; the frequency of every selected word is incremented.
    """
    filtered_text = []
    for sentence in ending_punctuation.split(clean_txt(txt)):
        if not sentence:
            continue
        selected = []
        for token in sentence.split():
            key = token.lower()
            if key not in dictionary:
                # Unknown word: register it with an unknown GRACE code.
                dictionary[key] = Word(key, ["?"], "")
            selected.append(key)
            dictionary[key].increase_frequency()
        if selected:
            filtered_text.append(selected)
    return filtered_text
||||
|
||||
|
||||
# For each GRACE part-of-speech code, the XML attribute names emitted for
# a <word>, in the same order as the fields produced by graceToString.
_POS_ATTRIBUTE_NAMES = {
    'N': ("wordclass", "wordtype", "gender", "number"),
    'V': ("wordclass", "wordtype", "mood", "tense", "person", "number",
          "gender"),
    'P': ("wordclass", "wordtype", "person", "gender", "number", "case",
          "possessor"),
    'A': ("wordclass", "wordtype", "degree", "gender", "number"),
    'D': ("wordclass", "wordtype", "person", "gender", "number", "possessor",
          "nature"),
    'R': ("wordclass", "wordtype", "degree"),
    'S': ("wordclass", "wordtype"),
    'C': ("wordclass", "wordtype"),
}


def prepare_output(splitted_txt):
    """Build the output XML tree from the filtered sentences.

    `splitted_txt` is a list of sentences, each a list of lowercase words
    that are keys of the global `dictionary`.  Every word becomes a <word>
    element carrying its frequency and its (first) GRACE analysis as
    attributes.  Returns the root <text> Element.
    """
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            # Only the first GRACE description is used.
            grace = word.get_grace()[0]
            grace_details = graceToString(grace)
            # Codes with no table entry ('I', 'X', 'F', '?') emit only the
            # word class, as before.
            names = _POS_ATTRIBUTE_NAMES.get(grace[0], ("wordclass",))
            try:
                attributes = {"frequency": str(word.get_frequency())}
                # Explicit indexing (not zip) preserves the original
                # IndexError on malformed/short GRACE strings.
                for position, name in enumerate(names):
                    attributes[name] = grace_details[position]
                ET.SubElement(sentence, "word", attributes).text = \
                    word.get_word()
            except IndexError:
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
    return text
||||
|
||||
|
||||
############################################################################### |
||||
# main program # |
||||
############################################################################### |
||||
|
||||
if __name__ == "__main__":
    # arg parse ###############################################################
    # Require one argument whose name ends in "<something>.txt".
    # sys.exit (not the site-provided exit()) is the correct way to
    # terminate a script.
    if len(sys.argv) < 2 or filename_end.search(sys.argv[1]) is None:
        sys.exit("Usage: ./parser.py *.txt")

    # load files ##############################################################
    print("Loading input file...", end='')
    with open(sys.argv[1], 'r') as txt_file:
        txt = txt_file.read()
    print("\rInput file loaded    ")
    print("Loading dictionary...", end='')
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")

    # now comes the parsing work ##############################################
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    # Output file name: the "txt" extension is swapped for "xml".
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)
Loading…
Reference in new issue