forgot to set up a git repo, so here it is
This commit is contained in:
commit
1245552e24
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
*.xml
|
1022663
dico.dictfr
Normal file
1022663
dico.dictfr
Normal file
File diff suppressed because it is too large
Load Diff
14247
les_miserables_tomeI.txt
Normal file
14247
les_miserables_tomeI.txt
Normal file
File diff suppressed because it is too large
Load Diff
475
parser.py
Executable file
475
parser.py
Executable file
@ -0,0 +1,475 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# imports #
|
||||||
|
###############################################################################
|
||||||
|
import csv
import re
import sys
# xml.etree.cElementTree was a deprecated alias removed in Python 3.9;
# xml.etree.ElementTree exposes the same API.
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Regex #
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
# Matches a single space located between two ending punctuation marks
# (!, ? or .), so that groups such as "? !" or ". . ." can be glued together.
blanks_between_points = re.compile(r'(?<=[!?.]) (?=[!?.])')

# Matches groups of ending punctuation (!, ? or .).
# It is used to split the text into sentences.
ending_punctuation = re.compile(r'[!?.]+')

# Matches the "txt" at the very end of a name, provided it is preceded by at
# least one word character and a dot. It is used twice:
#     - Argument conformity verification
#     - Resulting file creation
#       (the "txt" characters of the argument are replaced by "xml")
filename_end = re.compile(r'(?<=\w\.)txt$')

# Matches groups of -, ", ', ;, : and , characters, which are replaced by a
# single space.
intermediate_punctuation = re.compile(r'[,;:"\'-]+')

# Matches groups of white characters (\n, \r, \t and space), which are
# replaced by a single space.
whites = re.compile(r'[\n\r\t ]+')

# Matches a hyphen followed by line breaks: words split across lines are
# merged back by removing the match.
word_breaks = re.compile(r'-[\n\r]+')
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# global variables #
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
# NOTE(review): contains an uppercase "Ç" among otherwise lowercase letters;
# left untouched because no use of this constant is visible here.
french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"

# input text to parse
txt = ""

# dictionary of words loaded from linguistic dictionary
dictionary = {}

# The tables below translate the single-character codes of a GRACE
# description into explicit labels. '-' stands for "not specified" and maps
# to the string "None". Misspelled labels were corrected so that the same
# concept always yields the same label.

# First character of a GRACE description: the word category.
wordtype = {
    'N': "noun",
    'V': "verb",
    'P': "pronoun",
    'A': "adjective",
    'D': "determiner",
    'R': "adverb",
    'S': "adposition",
    'C': "conjunction",
    'I': "interjection",
    'X': "residual",
    'F': "punctuation",
    '?': "unknown",
}

nountypes = {'c': "common", 'p': "proper", 'k': "cardinal", '-': "None"}

verbtypes = {'m': "main", 'a': "auxiliary", '-': "None"}

verbmood = {
    'i': "indicative",
    's': "subjunctive",
    'm': "imperative",
    'c': "conditional",
    'n': "infinitive",
    'p': "participle",
    '-': "None",
}

verbtense = {
    'p': "present",
    'i': "imperfect",
    'f': "future",
    's': "past",
    '-': "None",
}

adjtype = {
    'f': "qualificative",
    'o': "ordinal",
    'k': "cardinal",
    'i': "indefinite",
    's': "possessive",
    '-': "None",
}

degree = {'p': "positive", 'c': "comparative", 'n': "negative", '-': "None"}

pronountype = {
    'p': "personal",
    'd': "demonstrative",
    'i': "indefinite",
    's': "possessive",
    't': "interrogative",
    'r': "relative",
    'x': "reflexive",
    'k': "cardinal",
    '-': "None",
}

pronouncase = {
    'n': "nominative",
    'a': "accusative",
    'd': "dative",
    'o': "oblique",
    'g': "genitive",
    '-': "None",
}

detertypes = {
    'a': "article",
    'd': "demonstrative",
    's': "possessive",
    'i': "indefinite",
    # NOTE(review): "iter-excl" looks like a typo (advtypes uses
    # "interro-excl"); kept as-is until the intended label is confirmed.
    't': "iter-excl",
    'r': "relative",
    'k': "cardinal",
    '-': "None",
}

deternature = {'d': "definite", 'i': "indefinite", '-': "None"}

advtypes = {'g': "general", 'p': "particle", 'x': "interro-excl", '-': "None"}

adptypes = {'p': "preposition", 'd': "deictic", '-': "None"}

conjtypes = {'c': "coordinative", 's': "subordinative", '-': "None"}

gender = {'m': "masculine", 'f': "feminine", '-': "None"}

number = {'s': "singular", 'p': "plural", '-': "None"}

person = {'1': "first", '2': "second", '3': "third", '-': "None"}
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# classes definition #
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
class Word:
    """One entry of the language's dictionary.

    An instance bundles the plain spelling of the word, its list of GRACE
    descriptions, the word categories derived from those descriptions, its
    phonetics and the number of times it was counted in the working text.
    """

    def __init__(self, word: str, grammar, phonetics: str):
        """Create a `Word`.

        `word` is the plain spelling, `grammar` the sequence of GRACE
        description strings and `phonetics` the phonetic transcription.
        The frequency counter starts at zero.
        """
        self.__word = word
        self.__grace = grammar
        # The category of a description is its first character.
        self.__category = [description[0] for description in grammar]
        self.__phon = phonetics
        self.__frequency = 0

    def __str__(self):
        """Return the spelling, the phonetics and one line per description"""
        header = self.__word + "\t/" + self.__phon + "/"
        details = [
            "\n\t" + graceToDebugString(entry) for entry in self.__grace
        ]
        return header + "".join(details)

    def get_word(self):
        """Return the plain spelling of the `Word`"""
        spelling = self.__word
        return spelling[:]

    def get_grace(self):
        """Return a copy of the GRACE descriptions of the `Word`"""
        descriptions = self.__grace
        return descriptions[:]

    def get_phon(self):
        """Return the phonetics of the `Word`"""
        phonetics = self.__phon
        return phonetics[:]

    def increase_frequency(self):
        """Count one more occurrence of the `Word`"""
        self.__frequency = self.__frequency + 1

    def get_frequency(self):
        """Return how many occurrences of the `Word` were counted"""
        return self.__frequency
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# functions definition #
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
def graceToDebugString(grace: str):
    """Turns into a single string a GRACE description

    The result is the explicit labels returned by `graceToString`, joined
    by single spaces.

    For debug purposes only
    """
    return " ".join(graceToString(grace))


def _grace_field_tables(category: str):
    """Return the lookup tables decoding the characters that follow the
    category character, in positional order, for the given category.

    Categories that carry no further fields get an empty tuple.
    """
    if category == 'N':
        return (nountypes, gender, number)
    if category == 'V':
        return (verbtypes, verbmood, verbtense, person, number, gender)
    if category == 'P':
        # The last field is decoded with the `number` table.
        return (pronountype, person, gender, number, pronouncase, number)
    if category == 'A':
        return (adjtype, degree, gender, number)
    if category == 'D':
        # Two consecutive fields are decoded with the `number` table.
        return (detertypes, person, gender, number, number, deternature)
    if category == 'R':
        return (advtypes, degree)
    if category == 'S':
        return (adptypes,)
    if category == 'C':
        return (conjtypes,)
    return ()


def graceToString(grace: str):
    """Returns an array of strings of explicit GRACE description

    The first element is the word category (looked up in `wordtype`), the
    following ones decode the next characters of `grace` according to the
    category.

    Raises IndexError when `grace` is shorter than its category requires and
    KeyError when a character is not a known code.
    """
    category = grace[0]
    tables = _grace_field_tables(category)
    # The category label is resolved first so that an unknown category
    # raises KeyError before any other field is looked at.
    labels = [wordtype[category]]
    for position, table in enumerate(tables, start=1):
        labels.append(table[grace[position]])
    return labels
|
||||||
|
|
||||||
|
|
||||||
|
def init_dictionary(path='dico.dictfr'):
    """Loads the language's dictionary into the global `dictionary`.

    The file at `path` is a comma-separated CSV: the first column holds the
    word itself, the second its semicolon-separated GRACE definitions and the
    third its X-SAMPA phonetics. Every row becomes a `Word` stored under the
    word's spelling; a later row replaces an earlier one with the same word.

    Blank lines are ignored. A non-blank row with fewer than three columns
    raises IndexError.

    NOTE(review): the file is opened with the platform's default encoding,
    like the input text in the main program - confirm both are as intended.
    """
    with open(path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            word, grammar, phonetics = row[0], row[1], row[2]
            # str.split returns a one-element list when ';' is absent, so a
            # single description needs no special case.
            dictionary[word] = Word(word, grammar.split(';'), phonetics)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_txt(txt):
    """Return `txt` after the cleaning substitutions have been applied"""
    # Applied in this order:
    #   1. hyphen + line break(s) removed, merging words split across lines
    #   2. groups of :;,'"- characters replaced by a space
    #   3. groups of white characters replaced by a single space
    #   4. spaces between ending points (!, ? or .) removed
    substitutions = (
        (word_breaks, ''),
        (intermediate_punctuation, ' '),
        (whites, ' '),
        (blanks_between_points, ''),
    )
    for pattern, replacement in substitutions:
        txt = pattern.sub(replacement, txt)
    return txt
|
||||||
|
|
||||||
|
|
||||||
|
def filter_text(txt):
    """Split `txt` into sentences of lower-cased words.

    The text is cleaned with `clean_txt`, cut on ending punctuation, and each
    non-empty sentence becomes a list of lower-cased words. Returns the list
    of those lists.

    As a side effect the global `dictionary` is updated: a word that is not
    in it yet is added as an unknown (`"?"`) `Word`, and the frequency of
    every word met is increased by one.
    """
    sentences = []
    for chunk in ending_punctuation.split(clean_txt(txt)):
        if not chunk:
            continue
        tokens = []
        # str.split() without argument never yields empty strings.
        for raw_word in chunk.split():
            key = raw_word.lower()
            if key not in dictionary:
                dictionary[key] = Word(key, ["?"], "")
            tokens.append(key)
            dictionary[key].increase_frequency()
        if tokens:
            sentences.append(tokens)
    return sentences
|
||||||
|
|
||||||
|
|
||||||
|
# XML attribute names, in output order, matching the labels returned by
# graceToString for each word category. Categories not listed here only get
# the "wordclass" attribute.
_WORD_ATTRIBUTES = {
    'N': ("wordclass", "wordtype", "gender", "number"),
    'V': ("wordclass", "wordtype", "mood", "tense", "person", "number",
          "gender"),
    'P': ("wordclass", "wordtype", "person", "gender", "number", "case",
          "possessor"),
    'A': ("wordclass", "wordtype", "degree", "gender", "number"),
    'D': ("wordclass", "wordtype", "person", "gender", "number", "possessor",
          "nature"),
    'R': ("wordclass", "wordtype", "degree"),
    'S': ("wordclass", "wordtype"),
    'C': ("wordclass", "wordtype"),
}


def prepare_output(splitted_txt):
    """Build the XML tree describing the parsed text.

    `splitted_txt` is a list of sentences, each sentence being a list of
    keys of the global `dictionary`. Returns a "text" element holding one
    "sentence" element per sentence, itself holding one "word" element per
    word. Each "word" carries the word's frequency and the explicit labels
    of its first GRACE description as attributes, and the word as text.

    A word whose GRACE description cannot be decoded (too short, or using an
    unknown code) is reported on standard output and left out of the tree.
    """
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            # First GRACE description used
            grace = word.get_grace()[0]
            try:
                # Decoding is what can fail on a malformed description, so
                # it has to happen inside the try block.
                grace_details = graceToString(grace)
            except (IndexError, KeyError):
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
                continue
            names = _WORD_ATTRIBUTES.get(grace[0], ("wordclass",))
            attributes = {"frequency": str(word.get_frequency())}
            attributes.update(zip(names, grace_details))
            ET.SubElement(sentence, "word", attributes).text = word.get_word()
    return text
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# main program #
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # arg parse ###############################################################
    # The only argument must be a file name ending in ".txt".
    if len(sys.argv) < 2 or filename_end.search(sys.argv[1]) is None:
        # sys.exit is used rather than the interactive helper exit(), which
        # only exists when the site module is loaded.
        sys.exit("Usage: ./parser.py *.txt")

    # load files ##############################################################
    print("Loading input file...", end='')
    with open(sys.argv[1], 'r') as txt_file:
        txt = txt_file.read()
    print("\rInput file loaded ")
    print("Loading dictionary...", end='')
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")

    # now comes the parsing work ##############################################
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    # The result is written next to the input, "txt" being replaced by "xml".
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)
|
Loading…
Reference in New Issue
Block a user