// Source code listing extracted from a package code-search page.
// Package: verbiste
//
// File: FrenchVerbDictionary.h

/*  $Id: FrenchVerbDictionary.h,v 1.22 2005/03/13 04:03:26 sarrazip Exp $
    FrenchVerbDictionary.h - Dictionary of verbs and conjugation templates

    verbiste - French conjugation system
    Copyright (C) 2003-2005 Pierre Sarrazin <http://sarrazip.com/>

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 2
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
    02111-1307, USA.
*/

#ifndef _H_FrenchVerbDictionary
#define _H_FrenchVerbDictionary

#include <verbiste/c-api.h>
#include <verbiste/misc-types.h>
#include <verbiste/Trie.h>

#include <libxml/xmlmemory.h>
#include <libxml/parser.h>

#include <assert.h>
#include <stdexcept>
#include <vector>
#include <set>
#include <string>


/** C++ namespace in which all of this library's names are defined. */
namespace verbiste {


/** French verbs and conjugation knowledge base.
    The text processing done by this class is case-sensitive.
*/
00047 class FrenchVerbDictionary
{
public:

    /** Load the french conjugation database.
      @param      conjugationFilename     filename of the XML document that
                              defines all the conjugation templates
      @param      verbsFilename           filename of the XML document that
                              defines all the known verbs and their
                              corresponding template
      @throws     logic_error       for invalid filename arguments,
                              unparseable or unexpected XML documents
                              (if verbs or template names are
                              mentioned, they are in Latin-1)
    */
    FrenchVerbDictionary(const std::string &conjugationFilename,
                  const std::string &verbsFilename)
                              throw (std::logic_error);

    /** Load the french conjugation database.
      Uses the default (hard-coded) location for the data filenames.
      @throws     logic_error       for invalid filename arguments,
                              unparseable or unexpected XML documents
                              (if verbs or template names are
                              mentioned, they are in Latin-1)
    */
    FrenchVerbDictionary() throw (std::logic_error);

    /** Destroys this object, but NOT the internal trie, to save time.
      CAUTION:
      The destruction of the trie currently takes too much time (2003-05-29).
      We let it leak because the destruction of this object is
      normally followed by the end of program execution anyway.
    */
    ~FrenchVerbDictionary();

    /** Returns a conjugation template specification from its name.
      @param      templateName      name of the desired template (e.g. "aim:er")
      @returns          a pointer to the TemplateSpec object if found,
                        or NULL otherwise
    */
    const TemplateSpec *getTemplate(const std::string &templateName) const;

    /** Returns an iterator for the list of conjugation templates. */
    ConjugationSystem::const_iterator beginConjugSys() const;

    /** Returns an iterator for the end of the list of conjugation templates. */
    ConjugationSystem::const_iterator endConjugSys() const;

    /** Returns the template used by a verb.
      @param      infinitive  infinitive form of the verb in Latin-1
                        (e.g., "manger", not "mange")
      @returns          the template of the form radical:termination
                        in Latin-1 (e.g., "aim:er"),
                        or NULL if the verb is unknown 
                        or if 'infinitive' is NULL
    */
    const char *getVerbTemplate(const char *infinitive) const;

    /** Returns the template used by a verb.
      @param      infinitive  infinitive form of the verb in Latin-1
                        (e.g., "manger", not "mange")
      @returns          the template of the form radical:termination
                        in Latin-1 (e.g., "aim:er"),
                        or NULL if the verb is unknown 
    */
    const char *getVerbTemplate(const std::string &infinitive) const;


    /** Returns an iterator for the list of known verbs. */
    VerbTable::const_iterator beginKnownVerbs() const;

    /** Returns an iterator for the end of the list of known verbs. */
    VerbTable::const_iterator endKnownVerbs() const;


    /** Describes an inflection according to a given conjugation template.
      If the given inflection is known to the given conjugation template,
      the list of possible modes, tenses and persons is returned.
      For example, the inflection "es" in the "aim:er" template
      can be the 2nd person singular of the indicative present
      ("tu aimes")
      or the 2nd person singular of the subjunctive present.
      ("que tu aimes").
      Here, two ModeTensePersonNumber objects would be in the returned vector.

      @param      templateName      name of the conjugation template to use
                        (e.g., "aim:er")
      @param      inflection  inflection to be described
                        (e.g., "erions")
      @returns          a pointer to a vector of ModeTensePersonNumber
                        objects (which must not be modified nor
                        destroyed), or NULL if the inflection was not
                        known to the template
    */
    const std::vector<ModeTensePersonNumber> *getMTPNForInflection(
                              const std::string &templateName,
                              const std::string &inflection) const;

    /** Converts an English mode name into the corresponding enumerated type.
      @param      modeName    English mode name (infinitive, indicative, etc)
      @returns          a member of the Mode enumeration
                        (INVALID_MODE if 'modeName' is not known)
    */
    static Mode convertModeName(const char *modeName);

    /** Converts an English tense name into the corresponding enumerated type.
      @param      tenseName   English tense name (present, past, etc)
      @returns          a member of the Tense enumeration
                        (INVALID_MODE if 'modeName' is not known)
    */
    static Tense convertTenseName(const char *tenseName);

    /** Analyzes a conjugated verb and finds all known possible cases.
      @param      conjugatedVerb    conjugated French verb in Latin-1
                        (e.g., "aimerions")
      @param      results           vector in which to store the inflection
                        descriptions (this vector is not emptied
                        before elements are stored in it);
                        no elements are stored in this vector
                        if the given conjugated verb is unknown
    */
    void deconjugate(const std::string &conjugatedVerb,
                  std::vector<InflectionDesc> &results);

    /** Returns the English name (in ASCII) of the given mode.
    */
    static const char *getModeName(Mode m);

    /** Returns the English name (in ASCII) of the given tense.
    */
    static const char *getTenseName(Tense t);

    /** Converts a Latin-1 string to lower-case.
      @param      latin1String      ISO-8859-1 character string to be converted
      @returns          lower-case version of the character string
    */
    std::string tolowerLatin1(const std::string &latin1String) const;

    /** Converts a Latin-1 string into UTF-8.
      @param      latin1String      ISO-8859-1 character string to be converted
      @returns          UTF-8 version of the character string
      @throws     int         errno value set by iconv(3)
    */
    std::string latin1ToUTF8(const std::string &latin1String) const throw(int);

    /** Converts a UTF-8 string into Latin-1.
      @param      utf8string  UTF-8 character string to be converted
      @returns          a ISO-8859-1 version of the character string
      @throws     int         errno value set by iconv(3)
    */
    std::string utf8ToLatin1(const std::string &utf8String) const throw(int);

    /** Converts the strings in the designated vector from UTF-8 to Latin-1.
      @param      vec         structure whose strings are to be converted
      @throws     int         errno value set by iconv(3)
    */
    void utf8ToLatin1(std::vector<InflectionDesc> &vec) const throw(int);

    /** Converts the strings in the designated vector from UTF-8 to Latin-1.
      @param      vec         structure whose strings are to be converted
      @throws     int         errno value set by iconv(3)
    */
    void utf8ToLatin1(
            std::vector<std::vector<std::string> > &vec) const throw(int);

    /**     Returns the content of an XML node in Latin-1.
      @param      doc         the XML document
      @param      node        the node of the XML document whose contents
                        are to be extracted
      @returns          a Latin-1 string representing the contents
                        of the node; this string is empty the
                        requested node does not exist
      @throws     int         errno value set by iconv(3), in the case of a
                        UTF-8 to Latin-1 conversion error
    */
    std::string getLatin1XmlNodeText(
                  xmlDocPtr doc, xmlNodePtr node) throw(int);

    /**     Returns the content of an XML property in Latin-1.
      @param      node        the node of the XML document
      @param      propName    the name of the property to extract
      @returns          a Latin-1 string representing the contents
                        of the property; this string is empty the
                        requested property does not exist
      @throws     int         errno value set by iconv(3), in the case of a
                        UTF-8 to Latin-1 conversion error
    */
    std::string getLatin1XmlProp(
                  xmlNodePtr node, const char *propName) throw(int);

    /** Gets the radical part of an infinitive, according to a template name.
      @param      infinitive  infinitive whose radical is requested
      @param      templateName      name of the conjugation template that applies
      @returns          a prefix of 'infinitive'
      @throws     logic_error the template name is invalid (no ':' found)
    */
    static std::string getRadical(
                const std::string &infinitive,
                const std::string &templateName) throw(std::logic_error);

    /** Generates the conjugation of a verb for a given mode and tense.
      The generated words are complete, they are not just inflections.
      @param      radical           radical part of the verb to conjugate
      @param      templ       conjugation template to apply
      @param      mode        mode to use
      @param      tense       tense to use
      @param      dest        vector of vectors of strings into which to
                        store the results; the result is a list of
                        "persons", and a person is a list of
                        "inflections"
      @param      includePronouns   put pronouns before conjugated verbs in the
                        modes where pronouns are used
      @param      aspirateH   notifies this function that the verb starts
                        with an aspirate h (e.g., "hacher", which
                        gives "je hache") instead of a silent h
                        (e.g., "habiter", which gives "j'habite")
    */
    static void generateTense(const std::string &radical,
                  const TemplateSpec &templ,
                  Mode mode,
                  Tense tense,
                  std::vector< std::vector<std::string> > &dest,
                  bool includePronouns = false,
                  bool aspirateH = false) throw();

    /** Indicates if the given verb starts with an aspirate h.
      An aspirate h means that one cannot make a contraction or liaison
      in front of the word.  For example, "hacher" has an aspirate h
      and this means that one says "je hache" and not "j'hache".
      The verb "habiter" however does not have an aspirate h, so one
      says "j'habite" and not "je habite".
    */
    bool isVerbStartingWithAspirateH(
                        const std::string &infinitive) const throw();

private:

    /** Trie that contains all known verb radicals.
      The associated information is a list of template names
      that can apply to the radical.
      The verb radicals and the template names are stored in Latin-1.
    */
00290     class VerbTrie : public Trie< std::vector<std::string> >
    {
    public:
      const FrenchVerbDictionary &fvd;
      std::vector<InflectionDesc> *results;

      /** Constructs a trie that keeps a reference to the dictionary.
          @param  d     reference to the verb dictionary
      */
00299       VerbTrie(const FrenchVerbDictionary &d)
        : Trie< std::vector<std::string> >(true),
          fvd(d),
          results(NULL)
      {
      }

      /** Callback invoked by the Trie<>::get() method.
          This callback will be called for each prefix of the searched
          string that corresponds to the radical of a known verb.
          Stores data in the vector<InflectionDesc> designated by
          the last call to setDestination().
          @param  conjugatedVerb    the searched string
          @param  index       length of the prefix
          @param  templateList      list of conjugation templates that
                              might apply to the conjugated verb
      */
      virtual void onFoundPrefixWithUserData(
                  const std::string &conjugatedVerb,
                  std::string::size_type index,
                  const std::vector<std::string> *templateList) const
                                                throw();

      /** Sets the destination vector in which callback() stores results.
          When the Trie<>::get() method is called on this object,
          it may invoke the callback() virtual method.
          callback() will store any results in the vector designated here.
          After calling get(), iterate through the vector to obtain
          the possible inflections of the conjugated verb.
          @param  d     destination vector designated as the
                        repository for results (may be NULL)
      */
00331       void setDestination(std::vector<InflectionDesc> *d)
      {
          results = d;
      }
    };

    friend class VerbTrie;


    ConjugationSystem conjugSys;
    VerbTable knownVerbs;
    std::set<std::string> aspirateHVerbs;
    InflectionTable inflectionTable;
    iconv_t toUTF8;
    iconv_t toLatin1;
    char latin1TolowerTable[256];
    VerbTrie verbTrie;


    void init(const std::string &conjugationFilename,
                  const std::string &verbsFilename)
                              throw (std::logic_error);
    void loadConjugationDatabase(const char *conjugationFilename)
                              throw (std::logic_error);
    void loadVerbDatabase(const char *verbsFilename)
                              throw (std::logic_error);
    void readConjugation(xmlDocPtr doc) throw(std::logic_error);
    static void generateOtherPastParticiple(const char *mascSing,
                              std::vector<std::string> &dest);
    void readVerbs(xmlDocPtr doc) throw(std::logic_error);

    // Forbidden operations:
    FrenchVerbDictionary(const FrenchVerbDictionary &x);
    FrenchVerbDictionary &operator = (const FrenchVerbDictionary &x);
};


}  // namespace verbiste


#endif  /* _H_FrenchVerbDictionary */

// Listing generated by Doxygen 1.6.0.