-Works on tokenizer

-Added verb "poder" to the Portuguese dictionary
-Added tests
-Added sintatics, quantifiers and qualifiers
-Added tokenization for quantifiers and qualifiers
Esse commit está contido em:
Felipe Nascimento de Moura
2010-12-24 02:37:57 -02:00
commit 597d05163c
14 arquivos alterados com 305 adições e 61 exclusões
@@ -169,6 +169,9 @@ class VerbalizerTest extends PHPUnit_Framework_TestCase {
/**
 * A word that is already in the infinitive ('varrer') is expected
 * to be returned unchanged by Verbalizer::toInfinitive().
 */
public function testToInfinitive12() {
    $infinitive = Verbalizer::toInfinitive('varrer');
    $this->assertEquals('varrer', $infinitive);
}
/**
 * The conjugated form 'pode' is expected to be taken back to its
 * infinitive, 'poder', by Verbalizer::toInfinitive().
 */
public function testToInfinitive13() {
    $infinitive = Verbalizer::toInfinitive('pode');
    $this->assertEquals('poder', $infinitive);
}
/**
* @todo Implement testLoadVerbs().
+2 -3
Ver Arquivo
@@ -3092,6 +3092,7 @@ pleitear
plotar
plugar
podar
poder
poetizar
polarizar
polemizar
@@ -4141,6 +4142,4 @@ ziguezaguear
zombar
zumbir
zunir
zurrar
âmbar
éter
zurrar
+17 -24
Ver Arquivo
@@ -1,14 +1,16 @@
<?php
/**
* This class will take the content to be used, and take
* it to its canonical form
* it to its canonical form.
* It extends the Inflect class, from the selected language
*
* @package cortex.analyst
* @author felipe
*/
class Canonic extends Inflect{
/**
* Takes a word to its canonic form
* Takes a word to its canonic form(singular, male form)
* @param string$word
* @return string
*/
@@ -26,36 +28,27 @@ class Canonic extends Inflect{
* @param array $content
* @return Array
*/
public function sweep(Array $content)
public function sweep()
{
$v= new Verbalizer;
$content= Mind::$content;
$newContent= Array();
foreach($content as $word)
{
if(!Verbalizer::isVerb($word) && strlen($word) > 1)
if(IgnoreForms::shouldBeIgnored($word))
continue;
if(strlen($word) > 1 && ($isVerb= Verbalizer::isVerb($word)))
{
//echo 'the canonic form of '.$word.' is '.Canonic::canonize($word);
$word= Canonic::canonize($word);
}
$word= Verbalizer::toInfinitive($word);
}elseif(false){
}else{
$word= Canonic::canonize($word);
}
$newContent[]= $word;
}
/*array_map(function ($word)
{
print_r($v);
//echo Verbalizer::toInfinitive($word);
//if(Verbalizer::isVerb($word))
//return Verbalizer::toInfinitive($word);
Canonic::canonize($word);
},
$content);
*/
print_r($newContent);
Mind::$content= $newContent;
Mind::$tokenizer= new Tokenizer();
return $newContent;
}
public function __construct()
{
}
}
?>
+228
Ver Arquivo
@@ -0,0 +1,228 @@
<?php
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
// Tokens to be used when building the spine.
// MT stands for MindToken.
//
// Punctuation tokens are negative, MT_VOID is zero and every other token
// is a distinct power of two, so no two tokens share a bit.

// punctuation
define('MT_PERIOD'  , -2);
define('MT_COMA'    , -1);

// no token
define('MT_VOID'    , 0);

// word classes
define('MT_VERB'    , 1);
define('MT_SUBST'   , 2);

// quantifiers
define('MT_NONE'    , 4);
define('MT_ONE'     , 8);
define('MT_OR'      , 16);
define('MT_MANY'    , 32);

// qualifiers
define('MT_QMUST'   , 64);
define('MT_QMAY'    , 128);
define('MT_QNOTNULL', 256);  // was 254, which overlapped the bits of seven other tokens
define('MT_QKEY'    , 512);  // was 564, which is not a power of two

// ignored words
define('MT_ANY'     , 1024);
/**
 * This class applies the most important tokens to the content.
 * It also builds the main structure that is compared against the
 * available sintatics. This structure is called SPINE.
 *
 * @author felipe
 */
class Tokenizer {
    /** @var array Known sintatic patterns, stored as pattern => true */
    public static $sintaticsList;
    /** @var array Quantifier words, grouped under 'none', 'one', 'many' and 'or' */
    public static $quantifiers;
    /** @var array Qualifier words, grouped under 'must', 'may', 'notnull' and 'key' */
    public static $qualifiers;
    /**
     * @var array One MT_* token per word of Mind::$content, filled by sweep().
     *            It must start as an array: appending with [] to a string
     *            is a fatal error on PHP >= 7.1.
     */
    public static $spine = array();

    /**
     * Reads the sintatics list, one pattern per line, from an already
     * opened stream and registers each pattern in self::$sintaticsList.
     * All whitespace is stripped from each line and empty lines are skipped.
     * The stream is not closed here; that is up to the caller.
     *
     * @name loadSintatics
     * @param resource $resource A readable stream handle (e.g. from fopen)
     * @return AssocArray The sintatics list, as pattern => true
     */
    public static function loadSintatics($resource)
    {
        if (!is_array(self::$sintaticsList)) {
            self::$sintaticsList = array();
        }

        // a failed fopen() hands us false; looping on feof(false) never ends
        if (!is_resource($resource)) {
            return self::$sintaticsList;
        }

        // testing the fgets() result (instead of feof() before reading)
        // avoids registering an empty pattern for the final, failed read
        while (($line = fgets($resource, 4096)) !== false) {
            $pattern = preg_replace('/\s/', '', $line);
            if ($pattern === '' || $pattern === null) {
                continue;
            }
            self::$sintaticsList[$pattern] = true;
        }

        return self::$sintaticsList;
    }

    /**
     * This method builds the structure of quantifiers from the passed
     * XML, reading the <none>, <one>, <many> and <or> elements as
     * comma separated lists of words.
     *
     * @name loadQuantifiers
     * @param SimpleXML $xml
     * @return AssocArray
     */
    public static function loadQuantifiers($xml)
    {
        self::$quantifiers = array();
        foreach (array('none', 'one', 'many', 'or') as $group) {
            self::$quantifiers[$group] = self::readWordList($xml, $group);
        }
        return self::$quantifiers;
    }

    /**
     * This method loads and builds the structure required to use and
     * identify the qualifiers from the passed XML, reading the <must>,
     * <may>, <notnull> and <key> elements as comma separated lists.
     *
     * @name loadQualifiers
     * @param SimpleXML $xml
     * @return AssocArray
     */
    public static function loadQualifiers($xml)
    {
        self::$qualifiers = array();
        foreach (array('must', 'may', 'notnull', 'key') as $group) {
            self::$qualifiers[$group] = self::readWordList($xml, $group);
        }
        return self::$qualifiers;
    }

    /**
     * This method verifies whether the passed word is a valid
     * quantifier in the given group of quantifiers.
     *
     * @param string $which In which quantifier group the word should be searched
     * @param string $what The word to be verified
     * @return boolean
     */
    public static function isQuantifier($which, $what)
    {
        return self::isListed(self::$quantifiers, $which, $what);
    }

    /**
     * This method verifies whether the passed word is a valid
     * qualifier in the given group of qualifiers.
     *
     * @param string $which In which qualifier group the word should be searched
     * @param string $what The word to be verified
     * @return boolean
     */
    public static function isQualifier($which, $what)
    {
        return self::isListed(self::$qualifiers, $which, $what);
    }

    /**
     * This method is called to load each possible modifier: the
     * sintatics list, the quantifiers and the qualifiers.
     * When there is a sintatics.list in the current directory, the three
     * files are read from there; otherwise they are read from the
     * directory of the current language.
     */
    public static function loadModifiers()
    {
        $dir = file_exists('sintatics.list')
            ? ''
            : Mind::$langPath.Mind::$l10n->name.'/';

        // start from a clean list BEFORE loading, so the loaded
        // patterns are kept
        self::$sintaticsList = array();
        $handle = fopen($dir.'sintatics.list', 'rb');
        if ($handle !== false) {
            self::loadSintatics($handle);
            fclose($handle);
        }

        self::loadQuantifiers(simplexml_load_file($dir.'quantifiers.xml'));
        self::loadQualifiers(simplexml_load_file($dir.'qualifiers.xml'));
    }

    /**
     * This method runs through all the words within Mind::$content
     * and performs all verifications, rebuilding self::$spine with one
     * token per word.
     *
     * @return boolean Always true, so callers testing the result go on
     */
    public function sweep()
    {
        self::loadModifiers();

        // the spine describes the current content only
        self::$spine = array();

        $content = is_array(Mind::$content) ? Mind::$content : array();
        foreach ($content as $word) {
            self::$spine[] = self::classify(self::lower($word));
        }

        return true;
    }

    /**
     * Picks the token for a single, already lowercased word.
     * The order of the verifications matters: ignored words first, then
     * punctuation, quantifiers, qualifiers, verbs and, when nothing
     * else matched, the word is taken as a substantive.
     *
     * @param string $word
     * @return int One of the MT_* tokens
     */
    private static function classify($word)
    {
        if (IgnoreForms::shouldBeIgnored($word)) {
            return MT_ANY;
        }

        if ($word === ',') {
            return MT_COMA;
        }
        if ($word === '.') {
            return MT_PERIOD;
        }

        // let's check for quantifiers
        $token = self::findToken(
            self::$quantifiers,
            array('none' => MT_NONE, 'one' => MT_ONE, 'many' => MT_MANY, 'or' => MT_OR),
            $word
        );
        if ($token !== null) {
            return $token;
        }

        // and here, the qualifiers
        $token = self::findToken(
            self::$qualifiers,
            array('must' => MT_QMUST, 'may' => MT_QMAY, 'notnull' => MT_QNOTNULL, 'key' => MT_QKEY),
            $word
        );
        if ($token !== null) {
            return $token;
        }

        // we know these words are already on their canonic form,
        // so we can simply look for them on the list
        if (Verbalizer::isInVerbList($word)) {
            return MT_VERB;
        }

        return MT_SUBST;
    }

    /**
     * Returns the token of the first group which lists the word, or
     * null when no group does.
     *
     * @param array $groups Words grouped by name
     * @param array $tokens Token of each group, in the order to be tried
     * @param string $word
     * @return int|null
     */
    private static function findToken($groups, $tokens, $word)
    {
        foreach ($tokens as $group => $token) {
            if (self::isListed($groups, $group, $word)) {
                return $token;
            }
        }
        return null;
    }

    /**
     * Verifies whether a word is listed in one group of a word structure.
     * The comparison is strict, string to string: a loose comparison
     * would treat numeric strings as numbers and take '00' for '0'.
     *
     * @param array $groups Words grouped by name
     * @param string $which The group to look into
     * @param string $what The word to be verified
     * @return boolean
     */
    private static function isListed($groups, $which, $what)
    {
        return isset($groups[$which]) &&
               in_array((string)$what, $groups[$which], true);
    }

    /**
     * Reads one element of the XML as a comma separated list of words.
     * Each word is trimmed and empty entries are dropped. An empty list
     * is returned when the XML could not be loaded or has no such element.
     *
     * @param SimpleXML $xml
     * @param string $name The name of the element to be read
     * @return array
     */
    private static function readWordList($xml, $name)
    {
        if (!is_object($xml) || !isset($xml->$name)) {
            return array();
        }

        $words = array();
        foreach (explode(',', (string)$xml->$name) as $entry) {
            $entry = trim($entry);
            if ($entry !== '') {
                $words[] = $entry;
            }
        }
        return $words;
    }

    /**
     * Lowercases a word. strtolower() works byte by byte and leaves
     * accented capitals such as 'Á' untouched, so mb_strtolower() is
     * used when it is available and the word is valid UTF-8.
     *
     * @param string $word
     * @return string
     */
    private static function lower($word)
    {
        if (function_exists('mb_strtolower') && mb_check_encoding($word, 'UTF-8')) {
            return mb_strtolower($word, 'UTF-8');
        }
        return strtolower($word);
    }
}
+5 -2
Ver Arquivo
@@ -3,6 +3,7 @@
* This class provides a list of instructions
* which define when a word should be ignored,
* plus a list of key words to ignore
*
* @package cortex.analyst
* @author felipe
*/
@@ -27,12 +28,14 @@
* This method reads the ignore.list file and
* parses it to an indexed array
* @static
* @static
* @name loadIgnoreList
*/
public static function loadIgnoreList()
{
$fR= fopen('ignore.list', 'rb');
if(!file_exists('ignore.list'))
$fR= fopen(Mind::$langPath.Mind::$l10n->name.'/ignore.list', 'rb');
else
$fR= fopen('ignore.list', 'rb');
self::$ignoreList= Array();
while (!feof($fR)){
$word= preg_replace('/\s/', '', fgets($fR, 4096));
+3 -1
Ver Arquivo
@@ -7,7 +7,7 @@
This class has been partly inspired by the codes cited above.
The other methods and all the regular expressions, except the ones
referring to plural and singular in English, were created by:
Felip Nascimento de Moura <felipenmoura@gmail.com>
Felipe Nascimento de Moura <felipenmoura@gmail.com>
* You can contribute changing this file and telling me, or maybe
* adding tests to it, and in case you find anything weird, please
* let me know :)
@@ -117,6 +117,8 @@ class Inflect implements inflection
'que',
'de',
'da',
'vários',
'várias',
'oculos',
'átlas',
'atlas',
+1 -5
Ver Arquivo
@@ -63,14 +63,10 @@ class Verbalizer {
'dou' => 'dar',
'deu' => 'dar',
'dão' => 'dar',
'deram' => 'dar',
'dará' => 'dar',
'darão' => 'dar',
'teem' => 'ter',
'terem' => 'ter',
'terão' => 'ter',
'tiveram' => 'ter',
'terão' => 'ter'
);
/**
+6 -3
Ver Arquivo
@@ -1,4 +1,7 @@
de
a
o
as
os
aos
que
da
do
bem
+14
Ver Arquivo
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Document : qualifiers.xml
Created on : december 21, 2010, 1:57 Am
Author : felipenmoura
Description:
This file describes the words that may be used as qualifiers
-->
<root>
<must>dever,precisar,demandar,necessitar</must>
<may>poder</may>
<notnull>obrigatório,not null,notnull,não nulo,requerido,required,necessário</notnull>
<key>chave,key,pk,indice,índice</key>
</root>
+6 -15
Ver Arquivo
@@ -2,22 +2,13 @@
<!--
Document : quantifiers.xml
Created on : November 7, 2010, 4:29 PM
Author : felipe
Author : felipenmoura
Description:
This file describes the words that may be used as quantifiers
-->
<root>
<many-to-many>
</many-to-many>
<many-to-one>
</many-to-one>
<one-to-many>
</one-to-many>
<one-to-one>
</one-to-one>
</root>
<none>nenhum,nada,zero,0,nenhuma</none>
<one>um,uma,1</one>
<many>uns,alguns,umas,algumas,muito,muita,muitos,vários,muitas,várias,diversos,diversas,n,m,x</many>
<or>ou,até,/</or>
</root>
+13
Ver Arquivo
@@ -0,0 +1,13 @@
/*
Q=May have, Must have
S=Substantive
V=Verb
0,N=Quantifiers
O=Or
*/
SVS
SQVS
SV0ONS
SVNS
SQV0ONS
SQVNS
+2 -3
Ver Arquivo
@@ -3092,6 +3092,7 @@ pleitear
plotar
plugar
podar
poder
poetizar
polarizar
polemizar
@@ -4141,6 +4142,4 @@ ziguezaguear
zombar
zumbir
zunir
zurrar
âmbar
éter
zurrar
+3 -3
Ver Arquivo
@@ -61,11 +61,11 @@ EOT
// search for special/unknown characters
if(!Mind::$lexer->sweep($main))
return false;
if(!Mind::$canonic->sweep(Mind::$content))
if(!Mind::$canonic->sweep())
return false;
// mark specific tokens
//if(!Mind::tokenizer::sweep($main))
// return false;
if(!Mind::$tokenizer->sweep())
return false;
// keep substantives and verbs on their canonical form
// on male singular, for example
//if(!Mind::canonic::sweep($main))
+2 -2
Ver Arquivo
@@ -1,2 +1,2 @@
bem, sabemos que todo professor pode ter muitos alunos & que cada aluno pode ter vários professores.
Também, que cada professora tem um chefe.
todos os professores tem um ou vários alunos.
Chave, notnull, precisa, pode.