forgot to set up a git repo, so here it is

This commit is contained in:
Phuntsok Drak-pa 2019-03-28 19:42:09 +01:00
commit 1245552e24
4 changed files with 1037386 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.xml

1022663
dico.dictfr Normal file

File diff suppressed because it is too large Load Diff

14247
les_miserables_tomeI.txt Normal file

File diff suppressed because it is too large Load Diff

475
parser.py Executable file
View File

@ -0,0 +1,475 @@
#!/usr/bin/env python3
###############################################################################
# imports #
###############################################################################
import csv
import re
import sys
import xml.etree.cElementTree as ET
###############################################################################
# Regex #
###############################################################################
# Removes the single blank separating two ending punctuation marks, so that a
# spaced-out group such as "? !" or ". . ." becomes one contiguous group.
blanks_between_points = re.compile(r'(?<=[!?.]) (?=[!?.])')
# Matches groups of ending punctuation (!, ? or .).
# It is used to split the text into sentences.
ending_punctuation = re.compile(r'[!?.]+')
# Matches the trailing "txt" of an argument ending in a word character (\w)
# followed by ".txt". It is used twice:
# - argument conformity verification
# - resulting file name creation
#   (the "txt" characters of the argument are replaced by "xml")
filename_end = re.compile(r'(?<=\w\.)txt$')
# Matches -, ", ', ;, : and , characters (possibly grouped) so that each group
# can be replaced by a single space.
intermediate_punctuation = re.compile(r'[,;:"\'\-]+')
# Matches groups of white characters (newline, carriage return, tab, space)
# so that each group can be replaced by a single space.
whites = re.compile(r'[\n\r\t ]+')
# Words are sometimes split across lines with a hyphen; this matches the
# hyphen and the following line break(s) so the two halves can be re-merged.
word_breaks = re.compile(r'\-[\n\r]+')
###############################################################################
# global variables #
###############################################################################
# Characters used in French words (a-z followed by accented and ligature
# forms). Not referenced elsewhere in the visible part of this file.
french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"

# Input text to parse (filled in by the main program).
txt = ""

# Words loaded from the linguistic dictionary, keyed by their plain string.
dictionary = {}

# The tables below translate the one-character codes of a GRACE description
# into human-readable labels. '-' stands for "not applicable".
# NOTE(review): a few labels are misspelled ("conjonction", "ponctuation",
# "qualifiative", "demanstrative", "subortinative", "iter-excl"). They are
# written verbatim into the XML output, so they are deliberately left
# untouched here; fix them only together with every consumer of that output.

# First character of a GRACE description: the word class.
wordtype = {
    "N": "noun",
    "V": "verb",
    "P": "pronoun",
    "A": "adjective",
    "D": "determiner",
    "R": "adverb",
    "S": "adposition",
    "C": "conjonction",
    "I": "interjection",
    "X": "residual",
    "F": "ponctuation",
    "?": "unknown",
}

# Nouns.
nountypes = {
    "c": "common",
    "p": "proper",
    "k": "cardinal",
    "-": "None",
}

# Verbs.
verbtypes = {
    "m": "main",
    "a": "auxiliary",
    "-": "None",
}
verbmood = {
    "i": "indicative",
    "s": "subjunctive",
    "m": "imperative",
    "c": "conditional",
    "n": "infinitive",
    "p": "participle",
    "-": "None",
}
verbtense = {
    "p": "present",
    "i": "imperfect",
    "f": "future",
    "s": "past",
    "-": "None",
}

# Adjectives (the degree table is shared with adverbs).
adjtype = {
    "f": "qualifiative",
    "o": "ordinal",
    "k": "cardinal",
    "i": "indefinite",
    "s": "possessive",
    "-": "None",
}
degree = {
    "p": "positive",
    "c": "comparative",
    "n": "negative",
    "-": "None",
}

# Pronouns.
pronountype = {
    "p": "personal",
    "d": "demanstrative",
    "i": "indefinite",
    "s": "possessive",
    "t": "interrogative",
    "r": "relative",
    "x": "reflexive",
    "k": "cardinal",
    "-": "None",
}
pronouncase = {
    "n": "nominative",
    "a": "accusative",
    "d": "dative",
    "o": "oblique",
    "g": "genitive",
    "-": "None",
}

# Determiners.
detertypes = {
    "a": "article",
    "d": "demonstrative",
    "s": "possessive",
    "i": "indefinite",
    "t": "iter-excl",
    "r": "relative",
    "k": "cardinal",
    "-": "None",
}
deternature = {
    "d": "definite",
    "i": "indefinite",
    "-": "None",
}

# Adverbs, adpositions and conjunctions.
advtypes = {
    "g": "general",
    "p": "particle",
    "x": "interro-excl",
    "-": "None",
}
adptypes = {
    "p": "preposition",
    "d": "deictique",
    "-": "None",
}
conjtypes = {
    "c": "coordinative",
    "s": "subortinative",
    "-": "None",
}

# Features shared by several word classes.
gender = {
    "m": "masculine",
    "f": "feminine",
    "-": "None",
}
number = {
    "s": "singular",
    "p": "plural",
    "-": "None",
}
person = {
    "1": "first",
    "2": "second",
    "3": "third",
    "-": "None",
}
###############################################################################
# classes definition #
###############################################################################
class Word:
    """A single entry of the language's dictionary.

    An instance stores the plain spelling of the word, its list of GRACE
    descriptions, the word-class letter of each description, its phonetics
    and a counter of how often the word occurred in the working text.
    """

    def __init__(self, word: str, grammar, phonetics: str):
        """Build a `Word`.

        word      -- plain string representation of the word
        grammar   -- sequence of GRACE description strings
        phonetics -- phonetic transcription of the word
        """
        self.__word = word
        self.__grace = grammar
        # The word class is the first character of each GRACE description.
        self.__category = [description[0] for description in grammar]
        self.__phon = phonetics
        self.__frequency = 0

    def __str__(self):
        """Return the word and its phonetics, followed by one indented line
        per GRACE description."""
        pieces = [self.__word + "\t/" + self.__phon + "/"]
        pieces.extend(
            "\n\t" + graceToDebugString(description)
            for description in self.__grace)
        return "".join(pieces)

    def get_word(self):
        """Return the plain string representation of the word."""
        return self.__word[:]

    def get_grace(self):
        """Return a copy of the word's GRACE descriptions."""
        return self.__grace[:]

    def get_phon(self):
        """Return the phonetics of the word."""
        return self.__phon[:]

    def increase_frequency(self):
        """Add one to the occurrence counter of the word."""
        self.__frequency += 1

    def get_frequency(self):
        """Return the occurrence counter of the word."""
        return self.__frequency
###############################################################################
# functions definition #
###############################################################################
def graceToDebugString(grace: str):
    """Turn a GRACE description into a single space-separated string.

    For debug purposes only. The labels, and their order, are exactly those
    returned by `graceToString`; delegating to it keeps the per-category
    field layout defined in one place instead of two parallel if/elif chains.

    Raises IndexError if `grace` is shorter than its word class requires and
    KeyError if it contains an unknown code letter.
    """
    return " ".join(graceToString(grace))
def graceToString(grace: str):
    """Return the explicit labels of a GRACE description as a list of strings.

    The first label is the word class (looked up from the first character).
    The following labels decode the next characters, one lookup table per
    position, according to the word class. Word classes without a listed
    layout yield only the word-class label.

    Raises IndexError if `grace` is too short for its word class and KeyError
    if a character is not a known code.
    """
    category = grace[0]
    # Lookup tables for positions 1, 2, ... of the description, per class.
    layouts = {
        'N': (nountypes, gender, number),
        'V': (verbtypes, verbmood, verbtense, person, number, gender),
        'P': (pronountype, person, gender, number, pronouncase, number),
        'A': (adjtype, degree, gender, number),
        'D': (detertypes, person, gender, number, number, deternature),
        'R': (advtypes, degree),
        'S': (adptypes,),
        'C': (conjtypes,),
    }
    labels = [wordtype[category]]
    for position, table in enumerate(layouts.get(category, ()), start=1):
        labels.append(table[grace[position]])
    return labels
def init_dictionary():
    """Load the language's dictionary from 'dico.dictfr' into `dictionary`.

    The file is a comma-separated CSV: the first column holds the word
    itself, the second its semicolon-separated GRACE descriptions, and the
    third the word's X-SAMPA phonetics. Each row becomes a `Word` stored
    under its plain string; blank lines are ignored.

    Raises IndexError if a non-empty row has fewer than three columns.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used -- confirm the file's encoding and pin it here.
    with open('dico.dictfr', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            entry, grammar, phonetics = row[0], row[1], row[2]
            # str.split already returns a one-element list when there is no
            # ';' in the field, so no special case is needed.
            dictionary[entry] = Word(entry, grammar.split(';'), phonetics)
def clean_txt(txt):
    """Return `txt` normalised for sentence and word splitting.

    The substitutions are applied in this order:
    1. hyphen + line break(s) are removed, re-merging words split over lines;
    2. groups of : ; , ' " - characters become a single space;
    3. groups of white characters (newline, carriage return, tab, space)
       become a single space;
    4. blanks between ending punctuation marks (!, ? or .) are removed.
    """
    substitutions = (
        (word_breaks, ''),
        (intermediate_punctuation, ' '),
        (whites, ' '),
        (blanks_between_points, ''),
    )
    for pattern, replacement in substitutions:
        txt = pattern.sub(replacement, txt)
    return txt
def filter_text(txt):
    """Split `txt` into sentences of lowercased words.

    The text is cleaned with `clean_txt`, cut at ending punctuation, and each
    sentence is split on whitespace. Returns a list of sentences, each being
    a non-empty list of lowercased words.

    As a side effect the global `dictionary` is updated: a word that is not
    in it yet is registered as a `Word` with the unknown GRACE code "?" and
    empty phonetics, and the frequency of every word met is increased by one.
    """
    sentences = []
    for chunk in ending_punctuation.split(clean_txt(txt)):
        if not chunk:
            continue
        tokens = []
        for raw_word in chunk.split():
            if not raw_word:
                continue
            key = raw_word.lower()
            if key not in dictionary:
                dictionary[key] = Word(key, ["?"], "")
            tokens.append(key)
            dictionary[key].increase_frequency()
        if tokens:
            sentences.append(tokens)
    return sentences
def prepare_output(splitted_txt):
    """Build the XML tree describing the parsed text.

    `splitted_txt` is a list of sentences, each a list of words that are keys
    of the global `dictionary`. Returns a "text" element holding one
    "sentence" element per sentence, itself holding one "word" element per
    word. Each "word" element has the word as text and carries its frequency,
    its word class and the decoded fields of its first GRACE description as
    attributes.

    A word whose first GRACE description cannot be decoded (too short, or
    containing an unknown code) is reported on standard output and left out
    of the tree.
    """
    # Attribute names of the fields following the word class, per category.
    # Categories not listed only get the "frequency" and "wordclass"
    # attributes.
    attribute_names = {
        'N': ("wordtype", "gender", "number"),
        'V': ("wordtype", "mood", "tense", "person", "number", "gender"),
        'P': ("wordtype", "person", "gender", "number", "case", "possessor"),
        'A': ("wordtype", "degree", "gender", "number"),
        'D': ("wordtype", "person", "gender", "number", "possessor",
              "nature"),
        'R': ("wordtype", "degree"),
        'S': ("wordtype",),
        'C': ("wordtype",),
    }
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            # Only the first GRACE description is used.
            grace = word.get_grace()[0]
            try:
                # Decoding is what fails on a malformed description, so it
                # has to happen inside the try for the report to be reached.
                grace_details = graceToString(grace)
            except (IndexError, KeyError):
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
                continue
            attributes = {
                "frequency": str(word.get_frequency()),
                "wordclass": grace_details[0],
            }
            attributes.update(
                zip(attribute_names.get(grace[0], ()), grace_details[1:]))
            ET.SubElement(sentence, "word", attributes).text = word.get_word()
    return text
###############################################################################
# main program #
###############################################################################
if __name__ == "__main__":
    # Argument check: exactly the pattern accepted by `filename_end`,
    # i.e. a name ending in a word character followed by ".txt".
    if len(sys.argv) < 2 or not filename_end.search(sys.argv[1]):
        # sys.exit is used rather than the `exit` helper, which is only
        # injected by the `site` module and not guaranteed to exist.
        sys.exit("Usage: ./parser.py *.txt")

    # Load files. The progress messages do not end with a newline, so they
    # are flushed explicitly to be visible while the loading is running.
    print("Loading input file...", end='', flush=True)
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used -- confirm the input encoding and pin it here.
    with open(sys.argv[1], 'r') as txt_file:
        txt = txt_file.read()
    print("\rInput file loaded ")
    print("Loading dictionary...", end='', flush=True)
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")

    # Parse the text and write the result next to the input file, with the
    # "txt" extension replaced by "xml".
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)