forgot to set up a git repo, so here it is
This commit is contained in:
commit
1245552e24
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.xml
|
1022663
dico.dictfr
Normal file
1022663
dico.dictfr
Normal file
File diff suppressed because it is too large
Load Diff
14247
les_miserables_tomeI.txt
Normal file
14247
les_miserables_tomeI.txt
Normal file
File diff suppressed because it is too large
Load Diff
475
parser.py
Executable file
475
parser.py
Executable file
@ -0,0 +1,475 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
###############################################################################
|
||||
# imports #
|
||||
###############################################################################
|
||||
import csv
import re
import sys

try:
    # cElementTree was deprecated in Python 3.3 and removed in 3.9;
    # plain ElementTree picks up the C accelerator automatically.
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
|
||||
|
||||
###############################################################################
|
||||
# Regex #
|
||||
###############################################################################
|
||||
|
||||
# Raw strings are used for every pattern below: sequences such as '\!'
# are invalid *string* escapes (a SyntaxWarning on modern Python), and
# punctuation inside a character class needs no backslash anyway.

# Sometimes words are split; this regex removes the whitespace left
# between consecutive ending-punctuation marks so they re-merge.
blanks_between_points = re.compile(r'(?<=[!?.]) (?=[!?.])')

# Finds groups of ending punctuation (!, ? or .).
# It is used to split the text into sentences.
ending_punctuation = re.compile(r'[!?.]+')

# Checks that the argument given to this program ends with at least one
# alphanumeric character followed by ".txt".  It is used twice:
#   - argument conformity verification
#   - resulting file creation ("txt" characters are replaced by "xml")
filename_end = re.compile(r'(?<=\w\.)txt$')

# Used to replace -, ", ', ;, : and , characters (possibly grouped)
# by a single space.
intermediate_punctuation = re.compile(r'[,;:"\'-]+')

# Replaces groups of white characters (\n, \r, \t, space) by a single
# space.
whites = re.compile(r'[\n\r\t ]+')

# Sometimes words are hyphenated across a line break; this regex
# re-merges them.
word_breaks = re.compile(r'-[\n\r]+')
|
||||
|
||||
###############################################################################
|
||||
# global variables #
|
||||
###############################################################################
|
||||
|
||||
# Alphabet of lower-case French characters, including accented letters
# and ligatures.  NOTE(review): not referenced anywhere in this file —
# presumably kept for future filtering; confirm before removing.
french_chars = "abcdefghijklmnopqrstuvwxyzàâæÇçéèêëîïôœùûüÿ"

# input text to parse (filled from the file given on the command line)
txt = ""

# dictionary of words loaded from linguistic dictionary:
# maps a lower-case word (str) to its `Word` instance
dictionary = {}

# GRACE category letter (position 0 of a GRACE code) -> word class.
# NOTE(review): these values are emitted verbatim into the XML output,
# so the French spellings "conjonction"/"ponctuation" are kept as-is;
# fixing them would change the output format for downstream consumers.
wordtype = {
    'N': "noun",
    'V': "verb",
    'P': "pronoun",
    'A': "adjective",
    'D': "determiner",
    'R': "adverb",
    'S': "adposition",
    'C': "conjonction",
    'I': "interjection",
    'X': "residual",
    'F': "ponctuation",
    '?': "unknown"
}

# Noun subtype (GRACE position 1 of an 'N' code).
nountypes = {'c': "common", 'p': "proper", 'k': "cardinal", '-': "None"}

# Verb subtype (position 1 of a 'V' code).
verbtypes = {'m': "main", 'a': "auxiliary", '-': "None"}

# Verb mood (position 2 of a 'V' code).
verbmood = {
    'i': "indicative",
    's': "subjunctive",
    'm': "imperative",
    'c': "conditional",
    'n': "infinitive",
    'p': "participle",
    '-': "None"
}

# Verb tense (position 3 of a 'V' code).
verbtense = {
    'p': "present",
    'i': "imperfect",
    'f': "future",
    's': "past",
    '-': "None"
}

# Adjective subtype (position 1 of an 'A' code).
# NOTE(review): "qualifiative" looks like a typo for "qualificative",
# but it is part of the emitted XML vocabulary — confirm before fixing.
adjtype = {
    'f': "qualifiative",
    'o': "ordinal",
    'k': "cardinal",
    'i': "indefinite",
    's': "possessive",
    '-': "None"
}

# Degree (adjectives position 2, adverbs position 2).
degree = {'p': "positive", 'c': "comparative", 'n': "negative", '-': "None"}

# Pronoun subtype (position 1 of a 'P' code).
# NOTE(review): "demanstrative" looks like a typo for "demonstrative";
# kept as-is because it is emitted verbatim — confirm before fixing.
pronountype = {
    'p': "personal",
    'd': "demanstrative",
    'i': "indefinite",
    's': "possessive",
    't': "interrogative",
    'r': "relative",
    'x': "reflexive",
    'k': "cardinal",
    '-': "None"
}

# Pronoun case (position 5 of a 'P' code).
pronouncase = {
    'n': "nominative",
    'a': "accusative",
    'd': "dative",
    'o': "oblique",
    'g': "genitive",
    '-': "None"
}

# Determiner subtype (position 1 of a 'D' code).
# NOTE(review): "iter-excl" presumably means "inter-excl"
# (interrogative/exclamative) — confirm before fixing.
detertypes = {
    'a': "article",
    'd': "demonstrative",
    's': "possessive",
    'i': "indefinite",
    't': "iter-excl",
    'r': "relative",
    'k': "cardinal",
    '-': "None"
}

# Determiner nature (position 6 of a 'D' code).
deternature = {'d': "definite", 'i': "indefinite", '-': "None"}

# Adverb subtype (position 1 of an 'R' code).
advtypes = {'g': "general", 'p': "particle", 'x': "interro-excl", '-': "None"}

# Adposition subtype (position 1 of an 'S' code).
adptypes = {'p': "preposition", 'd': "deictique", '-': "None"}

# Conjunction subtype (position 1 of a 'C' code).
# NOTE(review): "subortinative" looks like a typo for "subordinative";
# emitted verbatim — confirm before fixing.
conjtypes = {'c': "coordinative", 's': "subortinative", '-': "None"}

# Grammatical gender (shared by several categories).
gender = {'m': "masculine", 'f': "feminine", '-': "None"}

# Grammatical number (shared by several categories).
number = {'s': "singular", 'p': "plural", '-': "None"}

# Grammatical person (shared by several categories).
person = {'1': "first", '2': "second", '3': "third", '-': "None"}
|
||||
|
||||
###############################################################################
|
||||
# classes definition #
|
||||
###############################################################################
|
||||
|
||||
|
||||
class Word:
    """Any word found in the language's dictionary.

    It holds the plain string representation of the word, the list of
    its GRACE descriptions, the list of its GRACE categories (first
    letter of each description), its phonetics and its frequency in
    the working text.
    """

    def __init__(self, word: str, grammar, phonetics: str):
        """Build a `Word`.

        :param word: plain string representation of the word
        :param grammar: list of GRACE description strings
        :param phonetics: phonetics of the word, as a string
        """
        self.__word = word
        self.__grace = grammar
        # One category letter per GRACE description (its first char).
        self.__category = [description[0] for description in grammar]
        self.__phon = phonetics
        # Number of occurrences seen in the working text so far.
        self.__frequency = 0

    def __str__(self):
        """Return a debug-friendly stringified `Word`."""
        res = self.__word + "\t/" + self.__phon + "/"
        for elem in self.__grace:
            res += "\n\t" + graceToDebugString(elem)
        return res

    def get_word(self):
        """Return a copy of the plain string representation."""
        return self.__word[:]

    def get_grace(self):
        """Return a copy of the list of GRACE descriptions."""
        return self.__grace[:]

    def get_phon(self):
        """Return a copy of the phonetics string."""
        return self.__phon[:]

    def increase_frequency(self):
        """Increase by one the frequency of this `Word`."""
        self.__frequency += 1

    def get_frequency(self):
        """Return the frequency of this `Word` as a number."""
        return self.__frequency
|
||||
|
||||
|
||||
###############################################################################
|
||||
# functions definition #
|
||||
###############################################################################
|
||||
|
||||
|
||||
def graceToDebugString(grace: str):
    """Turn a GRACE description into a single human-readable string.

    For debug purposes only.  Categories 'I', 'X', 'F' and '?' carry
    no extra attributes, so only the word class is emitted for them.

    :param grace: one GRACE description code, e.g. "Ncms"
    :raises IndexError: when `grace` is shorter than its category implies
    :raises KeyError: when a position holds an unknown letter
    """
    res = wordtype[grace[0]]
    if grace[0] == 'N':
        res += " " + nountypes[grace[1]]
        res += " " + gender[grace[2]]
        res += " " + number[grace[3]]
    elif grace[0] == 'V':
        res += " " + verbtypes[grace[1]]
        res += " " + verbmood[grace[2]]
        res += " " + verbtense[grace[3]]
        res += " " + person[grace[4]]
        res += " " + number[grace[5]]
        res += " " + gender[grace[6]]
    elif grace[0] == 'P':
        res += " " + pronountype[grace[1]]
        res += " " + person[grace[2]]
        res += " " + gender[grace[3]]
        res += " " + number[grace[4]]
        res += " " + pronouncase[grace[5]]
        # NOTE(review): position 6 of a pronoun is rendered with the
        # `number` table here and called "possessor" in prepare_output
        # — confirm which naming is intended.
        res += " " + number[grace[6]]
    elif grace[0] == 'A':
        res += " " + adjtype[grace[1]]
        res += " " + degree[grace[2]]
        res += " " + gender[grace[3]]
        res += " " + number[grace[4]]
    elif grace[0] == 'D':
        res += " " + detertypes[grace[1]]
        res += " " + person[grace[2]]
        res += " " + gender[grace[3]]
        res += " " + number[grace[4]]
        res += " " + number[grace[5]]
        res += " " + deternature[grace[6]]
    elif grace[0] == 'R':
        res += " " + advtypes[grace[1]]
        res += " " + degree[grace[2]]
    elif grace[0] == 'S':
        res += " " + adptypes[grace[1]]
    elif grace[0] == 'C':
        res += " " + conjtypes[grace[1]]
    # 'I', 'X', 'F' and '?' intentionally add nothing.
    return res
|
||||
|
||||
|
||||
def graceToString(grace: str):
    """Return an array of strings of explicit GRACE description."""
    # Lookup tables for attribute positions 1..n of every GRACE
    # category, in output order.  Categories absent from the table
    # ('I', 'X', 'F', '?') only report their word class.
    attribute_tables = {
        'N': (nountypes, gender, number),
        'V': (verbtypes, verbmood, verbtense, person, number, gender),
        'P': (pronountype, person, gender, number, pronouncase, number),
        'A': (adjtype, degree, gender, number),
        'D': (detertypes, person, gender, number, number, deternature),
        'R': (advtypes, degree),
        'S': (adptypes,),
        'C': (conjtypes,),
    }
    category = grace[0]
    description = [wordtype[category]]
    for position, table in enumerate(attribute_tables.get(category, ()), 1):
        description.append(table[grace[position]])
    return description
|
||||
|
||||
|
||||
def init_dictionary(path='dico.dictfr'):
    """Load the language's dictionary into the global `dictionary`.

    The file is a comma-separated CSV with, in its first column, the
    word itself; in the second, a semicolon-separated GRACE definition
    of the word; and in the third, the word's X-SAMPA phonetics.

    :param path: dictionary file to load; defaults to the historical
        hard-coded 'dico.dictfr' so existing callers keep working
    """
    # NOTE(review): the bundled dictionary contains accented French
    # characters and is assumed to be UTF-8 encoded — confirm.
    with open(path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            # str.split always returns a list, and yields the whole
            # string as its single element when ';' is absent, so no
            # special case is needed.
            grammar = row[1].split(';')
            dictionary[row[0]] = Word(row[0], grammar, row[2])
|
||||
|
||||
|
||||
def clean_txt(txt):
    """This function cleans string given as argument"""
    # Each (regex, replacement) pair is applied in order:
    #  1. re-merge words hyphenated across a line break
    #  2. replace :;,'"- characters by a space
    #  3. collapse groups of white characters (\n, \r, \t, space)
    #     into a single space
    #  4. remove whites left between ending points (!, ? or .)
    cleanup_steps = (
        (word_breaks, ''),
        (intermediate_punctuation, ' '),
        (whites, ' '),
        (blanks_between_points, ''),
    )
    for regex, replacement in cleanup_steps:
        txt = regex.sub(replacement, txt)
    return txt
|
||||
|
||||
|
||||
def filter_text(txt):
    """Build a list of filtered sentences from the raw text.

    Each filtered sentence is itself a list of strings; every string
    is a lower-cased word.  In the same pass, words missing from the
    global `dictionary` are added to it with an unknown ('?') GRACE
    description, and the frequency of every selected word is
    increased as it is met.

    :param txt: raw text to clean, split and filter
    :return: list of sentences, each a non-empty list of words
    """
    filtered_text = []
    # ending_punctuation.split can yield empty strings at the text
    # boundaries, hence the filter(None, ...).
    for sentence in filter(None, ending_punctuation.split(clean_txt(txt))):
        word_list = []
        # str.split() with no argument never yields empty strings, so
        # no extra filtering is needed here.
        for word in sentence.split():
            lower_word = word.lower()
            if lower_word not in dictionary:
                dictionary[lower_word] = Word(lower_word, ["?"], "")
            word_list.append(lower_word)
            dictionary[lower_word].increase_frequency()
        if word_list:
            filtered_text.append(word_list)
    return filtered_text
|
||||
|
||||
|
||||
def prepare_output(splitted_txt):
    """Build the output XML tree from the filtered text.

    Every sentence becomes a <sentence> element whose <word> children
    carry the word's frequency and its explicit GRACE attributes.

    :param splitted_txt: list of sentences, each a list of words that
        are keys of the global `dictionary`
    :return: the root <text> element of the XML tree
    """
    # XML attribute names for GRACE positions 1..n of each category,
    # in the order graceToString returns their values.  Categories
    # absent from the table ('I', 'X', 'F', '?') only get a wordclass.
    xml_attributes = {
        'N': ("wordtype", "gender", "number"),
        'V': ("wordtype", "mood", "tense", "person", "number",
              "gender"),
        'P': ("wordtype", "person", "gender", "number", "case",
              "possessor"),
        'A': ("wordtype", "degree", "gender", "number"),
        'D': ("wordtype", "person", "gender", "number", "possessor",
              "nature"),
        'R': ("wordtype", "degree"),
        'S': ("wordtype",),
        'C': ("wordtype",),
    }
    text = ET.Element("text")
    for elem_sentence in splitted_txt:
        sentence = ET.SubElement(text, "sentence")
        for elem_word in elem_sentence:
            word = dictionary[elem_word]
            # First GRACE description used.
            grace = word.get_grace()[0]
            try:
                # Bug fix: graceToString indexes `grace` and used to be
                # called *outside* this try, so a GRACE code shorter
                # than its category implies crashed the whole run
                # instead of being reported below.
                grace_details = graceToString(grace)
                attributes = {
                    "frequency": str(word.get_frequency()),
                    "wordclass": grace_details[0],
                }
                for position, name in enumerate(
                        xml_attributes.get(grace[0], ()), 1):
                    attributes[name] = grace_details[position]
                ET.SubElement(sentence, "word",
                              **attributes).text = word.get_word()
            except IndexError:
                print("Failed on " + word.get_word() + "\tGRACE:" + grace)
    return text
|
||||
|
||||
|
||||
###############################################################################
|
||||
# main program #
|
||||
###############################################################################
|
||||
|
||||
if __name__ == "__main__":
    # arg parse ###############################################################
    # The single argument must end with "<alphanumeric>.txt".
    # sys.exit is used instead of the interactive-only exit() builtin,
    # which is provided by the site module and not guaranteed in scripts.
    if len(sys.argv) < 2 or filename_end.findall(sys.argv[1]) == []:
        sys.exit("Usage: ./parser.py *.txt")

    # load files ##############################################################
    print("Loading input file...", end='')
    # NOTE(review): no encoding is given, so the locale default applies
    # — confirm the French input texts match it.
    with open(sys.argv[1], 'r') as txt_file:
        txt = txt_file.read()
    print("\rInput file loaded ")
    print("Loading dictionary...", end='')
    init_dictionary()
    print("\r" + str(len(dictionary)) + " words loaded")

    # now comes the parsing work ##############################################
    filtered_text = filter_text(txt)
    root = prepare_output(filtered_text)
    tree = ET.ElementTree(root)
    # The "txt" suffix of the input name is replaced by "xml".
    tree.write(
        filename_end.sub('xml', sys.argv[1]),
        xml_declaration=True,
        encoding='utf-8',
        method="xml")
    sys.exit(0)
|
Loading…
Reference in New Issue
Block a user