Sid Gifari File Manager
🏠 Root
/
home
/
genremedia08
/
musicjukebox.overlookedtracks.com
/
vendor
/
teamtnt
/
tntsearch
/
src
/
Stemmer
/
Editing: FrenchStemmer.php
<?php

namespace TeamTNT\TNTSearch\Stemmer;

/**
 * French stemmer implementing the Snowball algorithm.
 *
 * The word is lower-cased, a few letters are temporarily upper-cased as
 * markers (step 0), the RV / R1 / R2 regions are computed once, and then the
 * suffix-stripping steps 1 to 6 are applied in sequence.
 *
 * @link http://snowball.tartarus.org/algorithms/french/stemmer.html
 * The original author is wamania
 */
class FrenchStemmer implements Stemmer
{
    /**
     * All french vowels
     */
    protected static $vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'â', 'à', 'ë', 'é', 'ê', 'è', 'ï', 'î', 'ô', 'û', 'ù'];

    /**
     * The word being stemmed (mutated by every step)
     * @var string
     */
    protected $word;

    /**
     * helper, contains stringified list of vowels
     * @var string
     */
    protected $plainVowels;

    /**
     * The original word, used to check if the word has been modified
     * @var string
     */
    protected $originalWord;

    /**
     * RV value
     * @var string
     */
    protected $rv;

    /**
     * RV index (based on the beginning of the word)
     * @var int
     */
    protected $rvIndex;

    /**
     * R1 value
     * @var string
     */
    protected $r1;

    /**
     * R1 index (based on the beginning of the word)
     * @var int
     */
    protected $r1Index;

    /**
     * R2 value
     * @var string
     */
    protected $r2;

    /**
     * R2 index (based on the beginning of the word)
     * @var int
     */
    protected $r2Index;

    /**
     * Stem a single word.
     *
     * @param string $word
     *
     * @return string The stemmed word
     */
    public static function stem($word)
    {
        return (new static)->analyze($word);
    }

    /**
     * Run the complete algorithm on $word and return its stem.
     *
     * @param string $word
     *
     * @return string
     */
    public function analyze($word)
    {
        $this->word        = mb_strtolower($word);
        $this->plainVowels = implode('', static::$vowels);

        $this->step0();

        // Regions are computed once, on the marked word, and never recomputed.
        $this->rv();
        $this->r1();
        $this->r2();

        // to know if step1, 2a or 2b have altered the word
        $this->originalWord = $this->word;

        $nextStep = $this->step1();

        // Do step 2a if either no ending was removed by step 1,
        // or if one of endings amment, emment, ment, ments was found.
        if (($nextStep === 2) || ($this->originalWord === $this->word)) {
            $modified = $this->step2a();

            if (!$modified) {
                $this->step2b();
            }
        }

        // Strict comparison: a loose one would compare numeric strings by value.
        if ($this->word !== $this->originalWord) {
            $this->step3();
        } else {
            $this->step4();
        }

        $this->step5();
        $this->step6();
        $this->finish();

        return $this->word;
    }

    /**
     * Assume the word is in lower case.
     * Then put into upper case u or i preceded and followed by a vowel, and y preceded or followed by a vowel.
     * u after q is also put into upper case. For example,
     *      jouer   ->   joUer
     *      ennuie  ->   ennuIe
     *      yeux    ->   Yeux
     *      quand   ->   qUand
     */
    private function step0()
    {
        $this->word = preg_replace('#([q])u#u', '$1U', $this->word);
        $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);
        $this->word = preg_replace('#y(['.$this->plainVowels.'])#u', 'Y$1', $this->word);
        $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word);
        $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word);
    }

    /**
     * Step 1
     * Search for the longest among the following suffixes, and perform the action indicated.
     *
     * Within each list the longer suffixes come first, because search()
     * returns the first suffix that matches.
     *
     * @return integer Next step number (2 or 3)
     */
    private function step1()
    {
        // ance   iqUe   isme   able   iste   eux   ances   iqUes   ismes   ables   istes
        //      delete if in R2
        if (($position = $this->search([
            'ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe', 'isme', 'able', 'iste', 'eux',
        ])) !== false) {
            if ($this->inR2($position)) {
                $this->word = mb_substr($this->word, 0, $position);
            }

            return 3;
        }

        // atrice   ateur   ation   atrices   ateurs   ations
        //      delete if in R2
        //      if preceded by ic, delete if in R2, else replace by iqU
        if (($position = $this->search(['atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = mb_substr($this->word, 0, $position);

                if (($position2 = $this->searchIfInR2(['ic'])) !== false) {
                    $this->word = mb_substr($this->word, 0, $position2);
                } else {
                    $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
                }
            }

            return 3;
        }

        // logie   logies
        //      replace with log if in R2
        if (($position = $this->search(['logies', 'logie'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(logies|logie)$#u', 'log', $this->word);
            }

            return 3;
        }

        // usion   ution   usions   utions
        //      replace with u if in R2
        if (($position = $this->search(['usions', 'utions', 'usion', 'ution'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(usion|ution|usions|utions)$#u', 'u', $this->word);
            }

            return 3;
        }

        // ence   ences
        //      replace with ent if in R2
        if (($position = $this->search(['ences', 'ence'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(ence|ences)$#u', 'ent', $this->word);
            }

            return 3;
        }

        // issement   issements
        //      delete if in R1 and preceded by a non-vowel
        // (strict test: position 0 is a valid match, not "not found")
        if (($position = $this->search(['issements', 'issement'])) !== false) {
            if ($this->inR1($position)) {
                $before = $position - 1;
                $letter = mb_substr($this->word, $before, 1);

                if (!in_array($letter, static::$vowels, true)) {
                    $this->word = mb_substr($this->word, 0, $position);
                }
            }

            return 3;
        }

        // ement   ements
        //      delete if in RV
        //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        //      if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
        //      if preceded by abl or iqU, delete if in R2, otherwise,
        //      if preceded by ièr or Ièr, replace by i if in RV
        if (($position = $this->search(['ements', 'ement'])) !== false) {
            if ($this->inRv($position)) {
                $this->word = mb_substr($this->word, 0, $position);
            }

            if (($position = $this->searchIfInR2(['iv'])) !== false) {
                $this->word = mb_substr($this->word, 0, $position);

                if (($position2 = $this->searchIfInR2(['at'])) !== false) {
                    $this->word = mb_substr($this->word, 0, $position2);
                }
            } elseif (($position = $this->search(['eus'])) !== false) {
                if ($this->inR2($position)) {
                    $this->word = mb_substr($this->word, 0, $position);
                } elseif ($this->inR1($position)) {
                    $this->word = preg_replace('#(eus)$#u', 'eux', $this->word);
                }
            } elseif (($position = $this->searchIfInR2(['abl', 'iqU'])) !== false) {
                $this->word = mb_substr($this->word, 0, $position);
            } elseif (($this->searchIfInRv(['ièr', 'Ièr'])) !== false) {
                $this->word = preg_replace('#(ièr|Ièr)$#u', 'i', $this->word);
            }

            return 3;
        }

        // ité   ités
        //      delete if in R2
        //      if preceded by abil, delete if in R2, else replace by abl, otherwise,
        //      if preceded by ic, delete if in R2, else replace by iqU, otherwise,
        //      if preceded by iv, delete if in R2
        if (($position = $this->search(['ités', 'ité'])) !== false) {
            // delete if in R2
            if ($this->inR2($position)) {
                $this->word = mb_substr($this->word, 0, $position);
            }

            if (($position = $this->search(['abil'])) !== false) {
                // if preceded by abil, delete if in R2, else replace by abl, otherwise,
                if ($this->inR2($position)) {
                    $this->word = mb_substr($this->word, 0, $position);
                } else {
                    $this->word = preg_replace('#(abil)$#u', 'abl', $this->word);
                }
            } elseif (($position = $this->search(['ic'])) !== false) {
                // if preceded by ic, delete if in R2, else replace by iqU, otherwise,
                if ($this->inR2($position)) {
                    $this->word = mb_substr($this->word, 0, $position);
                } else {
                    $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
                }
            } elseif (($position = $this->searchIfInR2(['iv'])) !== false) {
                // if preceded by iv, delete if in R2
                $this->word = mb_substr($this->word, 0, $position);
            }

            return 3;
        }

        // if   ive   ifs   ives
        //      delete if in R2
        //      if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2, else replace by iqU)
        if (($position = $this->search(['ifs', 'ives', 'if', 'ive'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = mb_substr($this->word, 0, $position);
            }

            if (($position = $this->searchIfInR2(['at'])) !== false) {
                $this->word = mb_substr($this->word, 0, $position);

                if (($position2 = $this->search(['ic'])) !== false) {
                    if ($this->inR2($position2)) {
                        $this->word = mb_substr($this->word, 0, $position2);
                    } else {
                        $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
                    }
                }
            }

            return 3;
        }

        // eaux
        //      replace with eau
        if (($this->search(['eaux'])) !== false) {
            $this->word = preg_replace('#(eaux)$#u', 'eau', $this->word);

            return 3;
        }

        // aux
        //      replace with al if in R1
        if (($position = $this->search(['aux'])) !== false) {
            if ($this->inR1($position)) {
                $this->word = preg_replace('#(aux)$#u', 'al', $this->word);
            }

            return 3;
        }

        // euse   euses
        //      delete if in R2, else replace by eux if in R1
        if (($position = $this->search(['euses', 'euse'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = mb_substr($this->word, 0, $position);
            } elseif ($this->inR1($position)) {
                $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word);
            }

            return 3;
        }

        // amment
        //      replace with ant if in RV
        if (($position = $this->search(['amment'])) !== false) {
            if ($this->inRv($position)) {
                $this->word = preg_replace('#(amment)$#u', 'ant', $this->word);
            }

            return 2;
        }

        // emment
        //      replace with ent if in RV
        if (($position = $this->search(['emment'])) !== false) {
            if ($this->inRv($position)) {
                $this->word = preg_replace('#(emment)$#u', 'ent', $this->word);
            }

            return 2;
        }

        // ment   ments
        //      delete if preceded by a vowel in RV
        // (strict test: position 0 is a valid match; inRv() then rejects the
        //  negative "before" index, so the word is left untouched)
        if (($position = $this->search(['ments', 'ment'])) !== false) {
            $before = $position - 1;
            $letter = mb_substr($this->word, $before, 1);

            if ($this->inRv($before) && in_array($letter, static::$vowels, true)) {
                $this->word = mb_substr($this->word, 0, $position);
            }

            return 2;
        }

        return 2;
    }

    /**
     * Step 2a: Verb suffixes beginning i
     * In steps 2a and 2b all tests are confined to the RV region.
     * Search for the longest among the following suffixes and if found, delete if preceded by a non-vowel.
     *      îmes   ît   îtes   i   ie   ies   ir   ira   irai   iraIent   irais   irait   iras   irent   irez   iriez
     *      irions   irons   iront   is   issaIent   issais   issait   issant   issante   issantes   issants   isse
     *      issent   isses   issez   issiez   issions   issons   it
     * (Note that the non-vowel itself must also be in RV.)
     *
     * @return bool True when a suffix was removed
     */
    private function step2a()
    {
        if (($position = $this->searchIfInRv([
            'îmes', 'îtes', 'ît', 'ies', 'ie', 'iraIent', 'irais', 'irait', 'irai', 'iras', 'ira', 'irent', 'irez', 'iriez',
            'irions', 'irons', 'iront', 'ir', 'issaIent', 'issais', 'issait', 'issant', 'issantes', 'issante', 'issants',
            'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i',
        ])) !== false) {
            $before = $position - 1;
            $letter = mb_substr($this->word, $before, 1);

            if ($this->inRv($before) && !in_array($letter, static::$vowels, true)) {
                $this->word = mb_substr($this->word, 0, $position);

                return true;
            }
        }

        return false;
    }

    /**
     * Do step 2b if step 2a was done, but failed to remove a suffix.
     * Step 2b: Other verb suffixes
     *
     * @return bool True when one of the suffix groups matched
     */
    private function step2b()
    {
        // é   ée   ées   és   èrent   er   era   erai   eraIent   erais   erait   eras   erez   eriez   erions   erons   eront   ez   iez
        //      delete
        if (($position = $this->searchIfInRv([
            'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez',
            'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez', 'és', 'ée', 'é',
        ])) !== false) {
            $this->word = mb_substr($this->word, 0, $position);

            return true;
        }

        // âmes   ât   âtes   a   ai   aIent   ais   ait   ant   ante   antes   ants   as   asse   assent   asses   assiez   assions
        //      delete
        //      if preceded by e, delete
        if (($position = $this->searchIfInRv([
            'âmes', 'âtes', 'ât', 'aIent', 'ais', 'ait', 'antes', 'ante', 'ants', 'ant',
            'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a',
        ])) !== false) {
            $before = $position - 1;
            $letter = mb_substr($this->word, $before, 1);

            if ($this->inRv($before) && ($letter === 'e')) {
                // drop the preceding e together with the suffix
                $this->word = mb_substr($this->word, 0, $before);
            } else {
                $this->word = mb_substr($this->word, 0, $position);
            }

            return true;
        }

        // ions
        //      delete if in R2
        if (($position = $this->searchIfInRv(['ions'])) !== false) {
            if ($this->inR2($position)) {
                $this->word = mb_substr($this->word, 0, $position);
            }

            return true;
        }

        return false;
    }

    /**
     * Step 3: Replace final Y with i or final ç with c
     */
    private function step3()
    {
        $this->word = preg_replace('#(Y)$#u', 'i', $this->word);
        $this->word = preg_replace('#(ç)$#u', 'c', $this->word);
    }

    /**
     * Step 4: Residual suffix
     *
     * @return bool True when one of the residual suffix rules matched
     */
    private function step4()
    {
        // If the word ends s, not preceded by a, i, o, u, è or s, delete it.
        // The "u" modifier is required so that è is matched as one character
        // rather than as two independent bytes.
        if (preg_match('#[^aiouès]s$#u', $this->word)) {
            $this->word = mb_substr($this->word, 0, -1);
        }

        // In the rest of step 4, all tests are confined to the RV region.

        // ion
        //      delete if in R2 and preceded by s or t
        if ((($position = $this->searchIfInRv(['ion'])) !== false) && $this->inR2($position)) {
            $before = $position - 1;
            $letter = mb_substr($this->word, $before, 1);

            if ($this->inRv($before) && (($letter === 's') || ($letter === 't'))) {
                $this->word = mb_substr($this->word, 0, $position);
            }

            return true;
        }

        // ier   ière   Ier   Ière
        //      replace with i
        if (($this->searchIfInRv(['ier', 'ière', 'Ier', 'Ière'])) !== false) {
            $this->word = preg_replace('#(ier|ière|Ier|Ière)$#u', 'i', $this->word);

            return true;
        }

        // e
        //      delete
        if (($this->searchIfInRv(['e'])) !== false) {
            $this->word = mb_substr($this->word, 0, -1);

            return true;
        }

        // ë
        //      if preceded by gu, delete
        if (($position = $this->searchIfInRv(['guë'])) !== false) {
            // $position + 2 is the index of the ë itself
            if ($this->inRv($position + 2)) {
                $this->word = mb_substr($this->word, 0, -1);

                return true;
            }
        }

        return false;
    }

    /**
     * Step 5: Undouble
     * If the word ends enn, onn, ett, ell or eill, delete the last letter
     */
    private function step5()
    {
        if ($this->search(['enn', 'onn', 'ett', 'ell', 'eill']) !== false) {
            $this->word = mb_substr($this->word, 0, -1);
        }
    }

    /**
     * Step 6: Un-accent
     * If the word ends é or è followed by at least one non-vowel, remove the accent from the e.
     */
    private function step6()
    {
        $this->word = preg_replace('#(é|è)([^'.$this->plainVowels.']+)$#u', 'e$2', $this->word);
    }

    /**
     * And finally:
     * Turn any remaining I, U and Y letters in the word back into lower case.
     */
    private function finish()
    {
        $this->word = str_replace(['I', 'U', 'Y'], ['i', 'u', 'y'], $this->word);
    }

    /**
     * If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found.
     * (Exceptionally, par, col or tap, at the beginning of a word is also taken to define RV as the region to their right.)
     *
     * Sets $this->rv and $this->rvIndex.
     *
     * @return bool False only when no vowel was found after the first letter
     */
    protected function rv()
    {
        $length = mb_strlen($this->word);

        // default: empty region at the end of the word
        $this->rv      = '';
        $this->rvIndex = $length;

        if ($length < 3) {
            return true;
        }

        // If the word begins with two vowels, RV is the region after the third letter
        $first  = mb_substr($this->word, 0, 1);
        $second = mb_substr($this->word, 1, 1);

        if (in_array($first, static::$vowels, true) && in_array($second, static::$vowels, true)) {
            $this->rv      = mb_substr($this->word, 3);
            $this->rvIndex = 3;

            return true;
        }

        // (Exceptionally, par, col or tap, at the beginning of a word is also taken to define RV as the region to their right.)
        $begin3 = mb_substr($this->word, 0, 3);

        if (in_array($begin3, ['par', 'col', 'tap'], true)) {
            $this->rv      = mb_substr($this->word, 3);
            $this->rvIndex = 3;

            return true;
        }

        // otherwise the region after the first vowel not at the beginning of the word,
        for ($i = 1; $i < $length; ++$i) {
            $letter = mb_substr($this->word, $i, 1);

            if (in_array($letter, static::$vowels, true)) {
                $this->rv      = mb_substr($this->word, ($i + 1));
                $this->rvIndex = $i + 1;

                return true;
            }
        }

        return false;
    }

    /**
     * @param int $position Character index in the word
     *
     * @return bool True when $position lies inside RV
     */
    protected function inRv($position)
    {
        return ($position >= $this->rvIndex);
    }

    /**
     * @param int $position Character index in the word
     *
     * @return bool True when $position lies inside R1
     */
    protected function inR1($position)
    {
        return ($position >= $this->r1Index);
    }

    /**
     * @param int $position Character index in the word
     *
     * @return bool True when $position lies inside R2
     */
    protected function inR2($position)
    {
        return ($position >= $this->r2Index);
    }

    /**
     * Same as search(), restricted to suffixes starting inside RV.
     *
     * @param string[] $suffixes
     *
     * @return int|false
     */
    protected function searchIfInRv($suffixes)
    {
        return $this->search($suffixes, $this->rvIndex);
    }

    /**
     * Same as search(), restricted to suffixes starting inside R2.
     *
     * @param string[] $suffixes
     *
     * @return int|false
     */
    protected function searchIfInR2($suffixes)
    {
        return $this->search($suffixes, $this->r2Index);
    }

    /**
     * Return the start position of the first listed suffix that terminates the word.
     *
     * Suffixes are tried in the given order, so callers list longer suffixes
     * before the shorter ones they contain.
     *
     * @param string[] $suffixes Candidate endings
     * @param int      $offset   Only matches starting at or after this index are considered
     *
     * @return int|false Start index of the matching suffix, or false when none matches
     */
    protected function search($suffixes, $offset = 0)
    {
        $length = mb_strlen($this->word);

        // The word may have been shortened since the region indexes were computed.
        if ($offset > $length) {
            return false;
        }

        foreach ($suffixes as $suffix) {
            $position = mb_strrpos($this->word, $suffix, $offset);

            // The last occurrence must end exactly at the end of the word.
            if (($position !== false) && ((mb_strlen($suffix) + $position) == $length)) {
                return $position;
            }
        }

        return false;
    }

    /**
     * R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
     */
    protected function r1()
    {
        list($this->r1Index, $this->r1) = $this->rx($this->word);
    }

    /**
     * R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel.
     */
    protected function r2()
    {
        list($index, $value) = $this->rx($this->r1);

        $this->r2      = $value;
        // rx() works relative to R1, so shift back to a word-based index
        $this->r2Index = $this->r1Index + $index;
    }

    /**
     * Common function for R1 and R2
     * Search the region after the first non-vowel following a vowel in $in, or the end of $in if there is no such non-vowel.
     * R1 : $in = $this->word
     * R2 : $in = R1
     *
     * @param string $in
     *
     * @return array [int $index, string $value] index of the region within $in, and the region itself
     */
    protected function rx($in)
    {
        $length = mb_strlen($in);

        // defaults: empty region located at the end of $in
        $value = '';
        $index = $length;

        // Look for a vowel immediately followed by a real non-vowel letter.
        // Stopping at $length - 1 keeps the empty string read past the end
        // from being mistaken for a non-vowel, which pushed the index to
        // $length + 1.
        for ($i = 0; $i + 1 < $length; ++$i) {
            $current = mb_substr($in, $i, 1);
            $next    = mb_substr($in, $i + 1, 1);

            if (in_array($current, static::$vowels, true) && !in_array($next, static::$vowels, true)) {
                $index = $i + 2;
                $value = mb_substr($in, $index);
                break;
            }
        }

        return [$index, $value];
    }
}
Save
Cancel