Logo Search packages:      
Sourcecode: verbiste version File versions

FrenchVerbDictionary.cpp

/*  $Id: FrenchVerbDictionary.cpp,v 1.18 2005/03/13 04:03:26 sarrazip Exp $
    FrenchVerbDictionary.cpp - Dictionary of verbs and conjugation templates

    verbiste - French conjugation system
    Copyright (C) 2003-2005 Pierre Sarrazin <http://sarrazip.com/>

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 2
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
    02111-1307, USA.
*/

#include "FrenchVerbDictionary.h"

#include <iostream>
#include <errno.h>

using namespace std;
using namespace verbiste;


class AutoDoc
{
public:
    AutoDoc(xmlDocPtr d) : doc(d) {}
    ~AutoDoc() { if (doc != NULL) xmlFreeDoc(doc); }
    xmlDocPtr get() const { return doc; }
    bool operator ! () const { return doc == NULL; }
private:
    xmlDocPtr doc;
};


class AutoString
{
public:
    AutoString(xmlChar *s) : str(s) {}
    ~AutoString() { if (str != NULL) xmlFree(str); }
    xmlChar *get() const { return str; }
    bool operator ! () const { return str == NULL; }
    size_t length() const { return str == NULL ? 0 : strlen((char *) str); }
private:
    xmlChar *str;
};


// Views a plain C string as a string of xmlChar, the character type taken
// by the libxml2 calls in this file.  No copy is made.
inline
const xmlChar *
XMLCHAR(const char *s)
{
    const xmlChar *asXmlChars = (const xmlChar *) s;
    return asXmlChars;
}


// Returns non-zero when xmlStrcmp() reports that the xmlChar string 'a'
// and the C string 'b' compare equal, and zero otherwise.
inline
int
equal(const xmlChar *a, const char *b)
{
    const int comparison = xmlStrcmp(a, XMLCHAR(b));
    return comparison == 0;
}


// Negation of equal(): returns 1 when 'a' and 'b' differ, 0 when they match.
inline
int
different(const xmlChar *a, const char *b)
{
    if (equal(a, b))
        return 0;
    return 1;
}


// Forwards to xmlGetProp() so that the attribute name can be given as a
// plain C string.  The result of xmlGetProp() is returned unchanged.
inline
xmlChar *
getProp(xmlNodePtr node, const char *propName)
{
    const xmlChar *xmlPropName = XMLCHAR(propName);
    return xmlGetProp(node, xmlPropName);
}


// Forwards to xmlNodeListGetString() for the node list starting at 'node',
// passing 1 as the 'inLine' argument.  The result is returned unchanged.
inline
xmlChar *
getString(xmlDocPtr doc, xmlNodePtr node)
{
    const int inLine = 1;
    return xmlNodeListGetString(doc, node, inLine);
}


// Concatenates the contents of an AutoString with a std::string.
// An AutoString that holds no string (NULL) is treated as empty, so that
// no std::string is ever built from a null pointer.
inline
string
operator + (const AutoString &a, const string &b)
{
    if (!a)
        return b;
    return (char *) a.get() + b;
}


// Concatenates a std::string with the contents of an AutoString.
// An AutoString that holds no string (NULL) is treated as empty, so that
// no null pointer is ever appended to the std::string.
inline
string
operator + (const string &a, const AutoString &b)
{
    if (!b)
        return a;
    return a + (char *) b.get();
}


// File-local overload that accepts an xmlChar string (such as a node name)
// and forwards it to FrenchVerbDictionary::convertModeName().
inline
Mode
convertModeName(const xmlChar *modeName)
{
    const char *plainName = (char *) modeName;
    return FrenchVerbDictionary::convertModeName(plainName);
}


// File-local overload that accepts an xmlChar string (such as a node name)
// and forwards it to FrenchVerbDictionary::convertTenseName().
inline
Tense
convertTenseName(const xmlChar *tenseName)
{
    const char *plainName = (char *) tenseName;
    return FrenchVerbDictionary::convertTenseName(plainName);
}


00129 FrenchVerbDictionary::FrenchVerbDictionary() throw (logic_error)
  : conjugSys(),
    knownVerbs(),
    inflectionTable(),
    verbTrie(*this)
{
    const char *libdatadir = NULL;
    #ifndef NDEBUG
    libdatadir = getenv("LIBDATADIR");
    #endif
    if (libdatadir == NULL)
      libdatadir = LIBDATADIR;

    string conjFN  = libdatadir + string("/") + "conjugation-fr.xml";
    string verbsFN = libdatadir + string("/") + "verbs-fr.xml";

    init(conjFN, verbsFN);
}


/*  Builds a dictionary from two explicitly named XML files.

    conjugationFilename: file passed to init() as the conjugation database.
    verbsFilename:       file passed to init() as the verb database.

    Throws logic_error if init() fails.
*/
FrenchVerbDictionary::FrenchVerbDictionary(
                        const string &conjugationFilename,
                        const string &verbsFilename)
                              throw (logic_error)
  : conjugSys(),
    knownVerbs(),
    inflectionTable(),
    verbTrie(*this)
{
    // All of the loading work is shared with the default constructor.
    init(conjugationFilename, verbsFilename);
}


void
FrenchVerbDictionary::init(const string &conjugationFilename,
                      const string &verbsFilename)
                              throw (logic_error)
{
    toUTF8 = iconv_open("UTF-8", "ISO-8859-1");
    if (toUTF8 == (iconv_t) -1)
      throw logic_error("conversion from ISO-8859-1 to UTF-8 not supported");
    toLatin1 = iconv_open("ISO-8859-1", "UTF-8");
    if (toLatin1 == (iconv_t) -1)
      throw logic_error("conversion from UTF-8 to ISO-8859-1 not supported");

    {
      for (int i = 0; i < 0xC0; i++)
          latin1TolowerTable[i] = tolower(char(i));
      for (int i = 0xC0; i < 0xE0; i++)
          latin1TolowerTable[i] = char(i + 0x20);
      for (int i = 0xE0; i < 0x100; i++)
          latin1TolowerTable[i] = char(i);
    }

    loadConjugationDatabase(conjugationFilename.c_str());
    loadVerbDatabase(verbsFilename.c_str());
}


void
FrenchVerbDictionary::loadConjugationDatabase(
                        const char *conjugationFilename)
                              throw (logic_error)
{
    if (conjugationFilename == NULL)
      throw invalid_argument("conjugationFilename");

    AutoDoc conjDoc = xmlParseFile(conjugationFilename);
    if (!conjDoc)
      throw logic_error("could not parse " + string(conjugationFilename));

    readConjugation(conjDoc.get());
}


void
FrenchVerbDictionary::loadVerbDatabase(
                        const char *verbsFilename)
                              throw (logic_error)
{
    if (verbsFilename == NULL)
      throw invalid_argument("verbsFilename");

    AutoDoc verbsDoc = xmlParseFile(verbsFilename);
    if (!verbsDoc)
      throw logic_error("could not parse " + string(verbsFilename));

    readVerbs(verbsDoc.get());
}


/*  Fills 'conjugSys' and 'inflectionTable' from a parsed conjugation
    document whose root element must be <conjugation-fr>.

    Each <template> child contributes one entry, keyed by its "name"
    attribute, to both tables.  Inside a template, every non-text child is
    taken as a mode, every non-text child of a mode as a tense, and every
    <p> child of a tense as a person holding zero or more inflections.

    Throws logic_error if the document is empty, has the wrong root, has a
    template without a name or without a colon in its name, or contains
    text that cannot be converted to Latin-1.
*/
void
FrenchVerbDictionary::readConjugation(xmlDocPtr doc) throw(logic_error)
{
    xmlNodePtr rootNodePtr = xmlDocGetRootElement(doc);

    if (rootNodePtr == NULL)
        throw logic_error("empty conjugation document");

    if (different(rootNodePtr->name, "conjugation-fr"))
        throw logic_error("wrong top node in conjugation document");

    try
    {
        for (xmlNodePtr templ = rootNodePtr->xmlChildrenNode;
                        templ != NULL;
                        templ = templ->next)
        {
            if (different(templ->name, "template"))  // ignore junk between tags
                continue;

            string tname = getLatin1XmlProp(templ, "name");
            if (tname.empty())
                throw logic_error("missing template name attribute");

            // The template name is the root and the termination,
            // with a colon in between.  For example, "pla:cer".

            if (tname.find(':') == string::npos)
                throw logic_error("missing colon in template name");

            // The use of the [] operator creates an empty conjugation
            // template spec, to which we keep a reference:

            TemplateSpec &theTemplateSpec = conjugSys[tname];

            // Same idea:

            TemplateInflectionTable &ti = inflectionTable[tname];

            // For each mode (e.g., infinitive, indicative, conditional, etc):
            for (xmlNodePtr mode = templ->xmlChildrenNode;
                            mode != NULL;
                            mode = mode->next)
            {
                if (equal(mode->name, "text"))  // any text in this node is ignored
                    continue;

                Mode theMode = ::convertModeName(mode->name);
                ModeSpec &theModeSpec = theTemplateSpec[theMode];

                // For each tense in the mode:
                for (xmlNodePtr tense = mode->xmlChildrenNode;
                                tense != NULL;
                                tense = tense->next)
                {
                    if (equal(tense->name, "text"))
                        continue;

                    Tense theTense = ::convertTenseName(tense->name);
                    TenseSpec &theTenseSpec = theModeSpec[theTense];

                    // For each person in the tense:
                    int personCounter = 0;
                    for (xmlNodePtr person = tense->xmlChildrenNode;
                                    person != NULL;
                                    person = person->next)
                    {
                        if (different(person->name, "p"))
                            continue;

                        personCounter++;

                        theTenseSpec.push_back(PersonSpec());
                        PersonSpec &thePersonSpec = theTenseSpec.back();

                        // For each inflection for this person:
                        // (Note that most persons of most verbs have only
                        // one inflection.)
                        for (xmlNodePtr inf = person->xmlChildrenNode;
                                        inf != NULL;
                                        inf = inf->next)
                        {
                            string variant = getLatin1XmlNodeText(
                                                doc, inf->xmlChildrenNode);
                            thePersonSpec.push_back(variant);

                            // Remember which mode, tense and person this
                            // termination can stand for in this template.
                            ModeTensePersonNumber mtpn(
                                    (char *) mode->name,
                                    (char *) tense->name,
                                    personCounter);
                            ti[variant].push_back(mtpn);
                        }
                    }
                }
            }
        }
    }
    catch (int)
    {
        // The Latin-1 helpers report an iconv failure by throwing an int.
        // This function only advertises logic_error, so letting the int
        // escape would end in std::unexpected(); translate it instead.
        throw logic_error(
                "character set conversion failed in conjugation document");
    }
}


/*  Returns, converted from UTF-8 to Latin-1, the text that
    xmlNodeListGetString() extracts from the node list starting at 'node'.
    Returns an empty string when no text is obtained.

    The string allocated by libxml2 is released before returning, including
    when the conversion throws.
    Throws an int (as thrown by utf8ToLatin1()) if the conversion fails.
*/
string
FrenchVerbDictionary::getLatin1XmlNodeText(xmlDocPtr doc, xmlNodePtr node)
                                                throw(int)
{
    AutoString text(getString(doc, node));
    if (!text)
        return string();
    return utf8ToLatin1((char *) text.get());
}


/*  Returns, converted from UTF-8 to Latin-1, the value of the attribute
    named 'propName' on 'node'.  Returns an empty string when xmlGetProp()
    yields no value.

    The string allocated by libxml2 is released before returning, including
    when the conversion throws.
    Throws an int (as thrown by utf8ToLatin1()) if the conversion fails.
*/
string
FrenchVerbDictionary::getLatin1XmlProp(xmlNodePtr node, const char *propName)
                                                throw(int)
{
    AutoString value(getProp(node, propName));
    if (!value)
        return string();
    return utf8ToLatin1((char *) value.get());
}


/*  Fills 'knownVerbs', 'aspirateHVerbs' and the verb trie from a parsed
    verbs document whose root element must be <verbs-fr>.

    For each non-text child of the root, the infinitive is read from the
    first child node and the template name from the node two siblings
    later.  The template must already exist in 'conjugSys', so the
    conjugation database has to be loaded first.

    Throws logic_error if the document is empty, has the wrong root, has an
    entry with a missing or empty infinitive or template, names an unknown
    template, or contains text that cannot be converted to Latin-1.
*/
void
FrenchVerbDictionary::readVerbs(xmlDocPtr doc) throw(logic_error)
{
    xmlNodePtr rootNodePtr = xmlDocGetRootElement(doc);

    if (rootNodePtr == NULL)
        throw logic_error("empty verbs document");

    if (different(rootNodePtr->name, "verbs-fr"))
        throw logic_error("wrong top node in verbs document");

    try
    {
        for (xmlNodePtr v = rootNodePtr->xmlChildrenNode;
                        v != NULL;
                        v = v->next)
        {
            if (equal(v->name, "text"))
                continue;

            xmlNodePtr i = v->xmlChildrenNode;
            if (i == NULL || i->xmlChildrenNode == NULL)
                throw logic_error("missing <i> node");

            string infinitive =
                        getLatin1XmlNodeText(doc, i->xmlChildrenNode);
            if (infinitive.empty())
                throw logic_error("empty <i> node");
            size_t lenInfinitive = infinitive.length();

            if (i->next == NULL)
                throw logic_error("unexpected end after <i> node");

            // The template node is taken to be two siblings after <i>.
            // NOTE(review): presumably a text node separates them in the
            // data file -- confirm against verbs-fr.xml.
            xmlNodePtr t = i->next->next;
            if (t == NULL)
                throw logic_error("missing <t> node");

            string tname = getLatin1XmlNodeText(doc, t->xmlChildrenNode);
            if (tname.empty())
                throw logic_error("empty <t> node");
            string::size_type posColon = tname.find(':');
            if (posColon == string::npos)
                throw logic_error("missing colon in <t> node");
            if (conjugSys.find(tname) == conjugSys.end())
                throw logic_error("unknown template name: " + tname);

            knownVerbs[infinitive] = tname;

            // <aspirate-h>: If this verb starts with an aspirate h,
            // remember it.  The presence of any node two siblings after
            // <t> is what marks the verb.
            if (t->next != NULL && t->next->next != NULL)
                aspirateHVerbs.insert(infinitive);

            // Insert the verb in the trie.
            // A list of template names is associated to each verb radical
            // in this trie.

            size_t lenTermination = tname.length() - posColon - 1;
            assert(lenTermination > 0);
            assert(lenInfinitive >= lenTermination);

            string verbRadical(infinitive, 0, lenInfinitive - lenTermination);

            vector<string> **templateListPtr =
                                verbTrie.getUserDataPointer(verbRadical);
            assert(templateListPtr != NULL);
            if (*templateListPtr == NULL)
            {
                // First verb seen with this radical: create its list.
                *templateListPtr = new vector<string>();
            }
            (*templateListPtr)->push_back(tname);
        }
    }
    catch (int)
    {
        // The Latin-1 helpers report an iconv failure by throwing an int.
        // This function only advertises logic_error, so letting the int
        // escape would end in std::unexpected(); translate it instead.
        throw logic_error(
                "character set conversion failed in verbs document");
    }
}


00409 FrenchVerbDictionary::~FrenchVerbDictionary()
{
    iconv_close(toLatin1);
    iconv_close(toUTF8);
}


/*  Looks up a conjugation template by name.

    Returns a pointer to the template spec stored in 'conjugSys', or NULL
    if no template has that name.  The pointer refers to data owned by
    this dictionary.
*/
const TemplateSpec *
FrenchVerbDictionary::getTemplate(const string &templateName) const
{
    const ConjugationSystem::const_iterator found =
                                        conjugSys.find(templateName);
    if (found != conjugSys.end())
        return &found->second;
    return NULL;
}


/*  Returns an iterator on the first entry of the conjugation system,
    for use together with endConjugSys().
*/
ConjugationSystem::const_iterator
FrenchVerbDictionary::beginConjugSys() const
{
    return conjugSys.begin();
}


/*  Returns the past-the-end iterator of the conjugation system,
    for use together with beginConjugSys().
*/
ConjugationSystem::const_iterator
FrenchVerbDictionary::endConjugSys() const
{
    return conjugSys.end();
}


const char *
00441 FrenchVerbDictionary::getVerbTemplate(const char *infinitive) const
{
    if (infinitive == NULL)
      return NULL;
    VerbTable::const_iterator it = knownVerbs.find(infinitive);
    if (it == knownVerbs.end())
      return NULL;
    return it->second.c_str();
}


const char *
FrenchVerbDictionary::getVerbTemplate(const string &infinitive) const
{
    return getVerbTemplate(infinitive.c_str());
}


/*  Returns an iterator on the first entry of the table of known verbs,
    for use together with endKnownVerbs().
*/
VerbTable::const_iterator
FrenchVerbDictionary::beginKnownVerbs() const
{
    return knownVerbs.begin();
}


/*  Returns the past-the-end iterator of the table of known verbs,
    for use together with beginKnownVerbs().
*/
VerbTable::const_iterator
FrenchVerbDictionary::endKnownVerbs() const
{
    return knownVerbs.end();
}


const std::vector<ModeTensePersonNumber> *
00474 FrenchVerbDictionary::getMTPNForInflection(
                        const std::string &templateName,
                        const std::string &inflection) const
{
    InflectionTable::const_iterator i = inflectionTable.find(templateName);
    if (i == inflectionTable.end())
      return NULL;
    const TemplateInflectionTable &ti = i->second;
    TemplateInflectionTable::const_iterator j = ti.find(inflection);
    if (j == ti.end())
      return NULL;
    return &j->second;
}


/*  Converts a mode name, as used for element names in the conjugation
    document, to the corresponding Mode value.

    Returns INVALID_MODE if 'modeName' is NULL or is not one of the names
    listed below.  The comparison is exact and case-sensitive.
*/
/*static*/
Mode
FrenchVerbDictionary::convertModeName(const char *modeName)
{
    if (modeName == NULL)
        return INVALID_MODE;

    static const struct
    {
        const char *name;
        Mode mode;
    }
    knownModes[] =
    {
        { "infinitive",  INFINITIVE_MODE  },
        { "indicative",  INDICATIVE_MODE  },
        { "conditional", CONDITIONAL_MODE },
        { "subjunctive", SUBJUNCTIVE_MODE },
        { "imperative",  IMPERATIVE_MODE  },
        { "participle",  PARTICIPLE_MODE  }
    };
    const size_t numModes = sizeof(knownModes) / sizeof(knownModes[0]);

    for (size_t i = 0; i < numModes; i++)
        if (strcmp(modeName, knownModes[i].name) == 0)
            return knownModes[i].mode;

    return INVALID_MODE;
}


/*  Converts a tense name, as used for element names in the conjugation
    document, to the corresponding Tense value.  Several names map to the
    same Tense value, as listed below.

    Returns INVALID_TENSE if 'tenseName' is NULL or is not one of the names
    listed below.  The comparison is exact and case-sensitive.
*/
/*static*/
Tense
FrenchVerbDictionary::convertTenseName(const char *tenseName)
{
    if (tenseName == NULL)
        return INVALID_TENSE;

    static const struct
    {
        const char *name;
        Tense tense;
    }
    knownTenses[] =
    {
        { "infinitive-present", PRESENT_TENSE   },
        { "present",            PRESENT_TENSE   },
        { "imperfect",          IMPERFECT_TENSE },
        { "future",             FUTURE_TENSE    },
        { "simple-past",        PAST_TENSE      },
        { "imperative-present", PRESENT_TENSE   },
        { "present-participle", PRESENT_TENSE   },
        { "past-participle",    PAST_TENSE      },
        { "past",               PAST_TENSE      }
    };
    const size_t numTenses = sizeof(knownTenses) / sizeof(knownTenses[0]);

    for (size_t i = 0; i < numTenses; i++)
        if (strcmp(tenseName, knownTenses[i].name) == 0)
            return knownTenses[i].tense;

    return INVALID_TENSE;
}


/*  Analyzes a conjugated verb by looking it up in the verb trie.

    The trie is pointed at 'results' for the duration of the lookup and is
    detached from it again afterwards; whatever the lookup produces is
    stored through that destination.  The value returned by the lookup
    itself is discarded.
*/
void
FrenchVerbDictionary::deconjugate(const string &conjugatedVerb,
                        std::vector<InflectionDesc> &results)
{
    std::vector<InflectionDesc> *const destination = &results;

    verbTrie.setDestination(destination);
    (void) verbTrie.get(conjugatedVerb);
    verbTrie.setDestination(NULL);
}


/*virtual*/
void
FrenchVerbDictionary::VerbTrie::onFoundPrefixWithUserData(
                  const string &conjugatedVerb,
                  string::size_type index,
                  const vector<std::string> *templateList) const throw()
{
    assert(templateList != NULL);
    if (results == NULL)
      return;

    string radical(conjugatedVerb, 0, index);
    string term(conjugatedVerb, index);

    /*
      'templateList' contains the names of conjugated templates that might
      apply to the conjugated verb.  We check each of them to see if there
      is one that accepts the given termination 'term'.
    */
    for (vector<string>::const_iterator i = templateList->begin();
                                    i != templateList->end(); i++)
    {
      const string &tname = *i;
      const TemplateInflectionTable &ti =
                        fvd.inflectionTable.find(tname)->second;
      TemplateInflectionTable::const_iterator j = ti.find(term);
      if (j == ti.end())
          continue;  // template 'tname' does not accept termination 'term'

      // template 'tname' accepts 'term', so we produce some results.

      string templateTerm(tname, tname.find(':') + 1);
          // termination of the infinitive form

      const vector<ModeTensePersonNumber> &v = j->second;
          // list of mode-tense-person combinations that can correspond
          // to the conjugated verb's termination

      for (vector<ModeTensePersonNumber>::const_iterator k = v.begin();
                                        k != v.end(); k++)
      {
          const ModeTensePersonNumber &mtpn = *k;

          string infinitive = radical + templateTerm;
            // the infinitive of the conjugated verb is formed from its
            // radical part and from the termination of the template name

          results->push_back(InflectionDesc(infinitive, tname, mtpn));
            // the InflectionDesc object is an analysis of the
            // conjugated verb
      }
    }
}


/*static*/
const char *
00608 FrenchVerbDictionary::getModeName(Mode m)
{
    if (int(m) < int(INFINITIVE_MODE) || int(m) > int(PARTICIPLE_MODE))
      return NULL;

    static const char *names[] =
    {
      "infinitive", "indicative", "conditional",
      "subjunctive", "imperative", "participle"
    };

    return names[int(m) - 1];
}


/*static*/
const char *
00625 FrenchVerbDictionary::getTenseName(Tense t)
{
    if (int(t) < int(PRESENT_TENSE) || int(t) > int(FUTURE_TENSE))
      return NULL;

    static const char *names[] =
    {
      "present", "past", "imperfect", "future"
    };

    return names[int(t) - 1];
}


/*  Returns a copy of 'latin1String' in which every byte has been replaced
    by its entry in the lower-case table built by init().
*/
string
FrenchVerbDictionary::tolowerLatin1(const string &latin1String) const
{
    string lowered;
    for (string::const_iterator it = latin1String.begin();
                                it != latin1String.end(); ++it)
    {
        // Index with the unsigned value so that bytes above 0x7F
        // select the upper half of the table.
        const unsigned char code = (unsigned char) *it;
        lowered += latin1TolowerTable[code];
    }
    return lowered;
}


/*  Converts a Latin-1 string to UTF-8 with the 'toUTF8' iconv descriptor.

    The terminating NUL is converted along with the text, and the result is
    the converted text up to the first NUL.
    Throws the value of errno (an int) if iconv() reports a failure.
*/
string
FrenchVerbDictionary::latin1ToUTF8(const string &latin1String) const throw(int)
{
    // Input: the bytes of the string followed by a terminating NUL.
    // Building the buffer from the iterators (rather than with strcpy())
    // guarantees that every byte handed to iconv() is initialized.
    vector<char> inbuf(latin1String.begin(), latin1String.end());
    inbuf.push_back('\0');

    // Output: room for two bytes per input character, plus the NUL.
    vector<char> outbuf(latin1String.length() * 2 + 1);

    char *in = &inbuf[0];
    char *out = &outbuf[0];
    size_t inbytesleft = inbuf.size();
    size_t outbytesleft = outbuf.size();
    if (iconv(toUTF8, &in, &inbytesleft, &out, &outbytesleft) == (size_t) -1)
    {
        const int e = errno;  // saved before any cleanup can change it
        throw e;
    }

    return string(&outbuf[0]);
}


/*  Converts a UTF-8 string to Latin-1 with the 'toLatin1' iconv
    descriptor.

    The terminating NUL is converted along with the text, and the result is
    the converted text up to the first NUL.
    Throws the value of errno (an int) if iconv() reports a failure.
*/
string
FrenchVerbDictionary::utf8ToLatin1(const string &utf8String) const throw(int)
{
    // Input: the bytes of the string followed by a terminating NUL.
    // Building the buffer from the iterators (rather than with strcpy())
    // guarantees that every byte handed to iconv() is initialized.
    vector<char> inbuf(utf8String.begin(), utf8String.end());
    inbuf.push_back('\0');

    // Output: one byte per input byte (including the NUL), as in the
    // buffer size this function has always used.
    vector<char> outbuf(utf8String.length() + 1);

    char *in = &inbuf[0];
    char *out = &outbuf[0];
    size_t inbytesleft = inbuf.size();
    size_t outbytesleft = outbuf.size();
    if (iconv(toLatin1, &in, &inbytesleft, &out, &outbytesleft) == (size_t) -1)
    {
        const int e = errno;  // saved before any cleanup can change it
        throw e;
    }

    return string(&outbuf[0]);
}


/*  Converts, in place, the 'infinitive' field of every element of 'vec'
    from UTF-8 to Latin-1.

    Throws an int if one of the conversions fails; elements before the
    failing one have already been converted at that point.
*/
void
FrenchVerbDictionary::utf8ToLatin1(vector<InflectionDesc> &vec) const throw(int)
{
    vector<InflectionDesc>::iterator desc = vec.begin();
    while (desc != vec.end())
    {
        desc->infinitive = utf8ToLatin1(desc->infinitive);
        desc++;
    }

    // The 'templateName' field is in ASCII and does not need to be converted.
}


/*  Converts, in place, every string of a vector of vectors of strings
    from UTF-8 to Latin-1.

    Throws an int if one of the conversions fails; strings before the
    failing one have already been converted at that point.
*/
void
FrenchVerbDictionary::utf8ToLatin1(
                  vector<vector<string> > &vec) const throw(int)
{
    for (vector<vector<string> >::iterator row = vec.begin();
                                           row != vec.end(); row++)
    {
        vector<string> &strings = *row;
        for (vector<string>::iterator s = strings.begin();
                                      s != strings.end(); s++)
        {
            const string converted = utf8ToLatin1(*s);
            *s = converted;
        }
    }
}



/*  Returns the radical of 'infinitive' under the template 'templateName',
    i.e., the infinitive without as many trailing characters as there are
    after the colon in the template name.  For example, the template
    "pla:cer" removes the last three characters.

    Throws logic_error if the template name contains no colon, or if the
    infinitive is shorter than the template's termination (in which case
    no radical can be formed).
*/
/*static*/
string
FrenchVerbDictionary::getRadical(
                  const string &infinitive,
                  const string &templateName) throw(logic_error)
{
    const string::size_type posColon = templateName.find(':');
    if (posColon == string::npos)
        throw logic_error("no colon found in template name");

    const string::size_type lenSuffix = templateName.length() - posColon - 1;

    // Without this check, the unsigned subtraction below would wrap
    // around and the whole infinitive would be returned as the radical.
    if (lenSuffix > infinitive.length())
        throw logic_error("infinitive shorter than template termination");

    const string::size_type lenInfPrefix = infinitive.length() - lenSuffix;
    return string(infinitive, 0, lenInfPrefix);
}


/*static*/
void
FrenchVerbDictionary::generateTense(const string &radical,
                        const TemplateSpec &templ,
                        Mode mode,
                        Tense tense,
                        vector< vector<string> > &dest,
                        bool includePronouns,
                        bool aspirateH) throw()
{
    const ModeSpec &modeSpec = templ.find(mode)->second;
    const TenseSpec &tenseSpec = modeSpec.find(tense)->second;

    if (mode != INDICATIVE_MODE
          && mode != CONDITIONAL_MODE
          && mode != SUBJUNCTIVE_MODE)
      includePronouns = false;

    for (TenseSpec::const_iterator p = tenseSpec.begin();
                            p != tenseSpec.end(); p++)
    {
      dest.push_back(vector<string>());
      for (PersonSpec::const_iterator i = p->begin(); i != p->end(); i++)
      {
          string pronoun;
          string v = radical + *i;

          if (includePronouns)
          {
            size_t noPers = p - tenseSpec.begin();
            switch (noPers)
            {
                case 0:
                {
                  bool elideJe = false;
                  if (!aspirateH)
                  {
                      char init = (v.empty() ? '\0' : v[0]);
                      bool isVowelOrH = (strchr(
                            "aeiouyhAEIOUYH",
                                                init) != NULL);
                      if (isVowelOrH)
                        elideJe = true;
                  }
                  pronoun = (elideJe ? "j'" : "je ");
                  break;
                }
                case 1: pronoun = "tu "; break;
                case 2: pronoun = "il "; break;
                case 3: pronoun = "nous "; break;
                case 4: pronoun = "vous "; break;
                case 5: pronoun = "ils "; break;
            }

            if (mode == SUBJUNCTIVE_MODE)
            {
                if (noPers == 2 || noPers == 5)
                  pronoun = "qu'" + pronoun;
                else
                  pronoun = "que " + pronoun;
            }
          }

          dest.back().push_back(pronoun + v);
      }
    }
}


00810 bool FrenchVerbDictionary::isVerbStartingWithAspirateH(
                        const std::string &infinitive) const throw()
{
    return aspirateHVerbs.find(infinitive) != aspirateHVerbs.end();
}

Generated by  Doxygen 1.6.0   Back to index