= $strlen) { break; } } // Add last bit, if not ending with split $lengths[] = array($strlen - $position, false, null); // Substrings $parts = array(); $position = 0; $count = 1; foreach ($lengths as $length) { $is_delimiter = $length[1]; $is_captured = $length[2]; if ($limit > 0 && !$is_delimiter && ($length[0] || ~$flags & PREG_SPLIT_NO_EMPTY) && ++$count > $limit) { if ($length[0] > 0 || ~$flags & PREG_SPLIT_NO_EMPTY) { $parts[] = $flags & PREG_SPLIT_OFFSET_CAPTURE ? array(mb_strcut($string, $position), $position) : mb_strcut($string, $position); } break; } elseif ((!$is_delimiter || ($flags & PREG_SPLIT_DELIM_CAPTURE && $is_captured)) && ($length[0] || ~$flags & PREG_SPLIT_NO_EMPTY)) { $parts[] = $flags & PREG_SPLIT_OFFSET_CAPTURE ? array(mb_strcut($string, $position, $length[0]), $position) : mb_strcut($string, $position, $length[0]); } $position += $length[0]; } return $parts; } /** * Breaks a piece of text into lines by linebreak. * Eats up any linebreak characters as if one. * * Multibyte safe * * @param string $text * @return array */ private static function linebreakSplit($text) { $lines = array(); $line = ''; foreach (self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) { $line .= $part; if (self::mbTrim($part) === '') { $lines[] = $line; $line = ''; } } $lines[] = $line; return $lines; } /** * Splits an array of lines by (consecutive sequences of) * terminals, keeping terminals. * * Multibyte safe (atleast for UTF-8) * * For example: * "There ... is. More!" * ... becomes ... * [ "There ", "...", " is", ".", " More", "!" ] * * @param array $lines * @return array */ private function punctuationSplit($line) { $parts = array(); $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe! $is_terminal = in_array($chars[0], $this->terminals); $part = ''; foreach ($chars as $index => $char) { if (in_array($char, $this->terminals) !== $is_terminal) { $parts[] = $part; $part = ''; $is_terminal = !$is_terminal; } $part .= $char; } if (!empty($part)) { $parts[] = $part; } return $parts; } /** * Appends each terminal item after it's preceding * non-terminals. * * Multibyte safe (atleast for UTF-8) * * For example: * [ "There ", "...", " is", ".", " More", "!" ] * ... becomes ... * [ "There ... is.", "More!" ] * * @param array $punctuations * @return array */ private function punctuationMerge($punctuations) { $definite_terminals = array_diff($this->terminals, $this->abbreviators); $merges = array(); $merge = ''; foreach ($punctuations as $punctuation) { if ($punctuation !== '') { $merge.= $punctuation; if (mb_strlen($punctuation) === 1 && in_array($punctuation, $this->terminals)) { $merges[] = $merge; $merge = ''; } else { foreach ($definite_terminals as $terminal) { if (mb_strpos($punctuation, $terminal) !== false) { $merges[] = $merge; $merge = ''; break; } } } } } if (!empty($merge)) { $merges[] = $merge; } return $merges; } /** * Merges any one-word items with it's preceding items. * * Multibyte safe * * For example: * [ "There ... is.", "More!" ] * ... becomes ... * [ "There ... is. More!" ] * * @param array $fragments * @return array */ private function abbreviationMerge($fragments) { $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators); $abbreviations = array(); $abbreviation = ''; $previous_word_count = null; $previous_word_ending = null; foreach ($fragments as $fragment) { $word_count = count(mb_split('\s+', self::mbTrim($fragment))); $starts_with_space = mb_ereg_match('^\s+', $fragment); $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals); if ($after_non_abbreviating_terminal || ($previous_word_count !== null && ($previous_word_count !== 1 || $word_count !== 1) && $starts_with_space)) { $abbreviations[] = $abbreviation; $abbreviation = ''; } $abbreviation .= $fragment; $previous_word_count = $word_count; $previous_word_ending = mb_substr($fragment, -1); } if ($abbreviation !== '') { $abbreviations[] = $abbreviation; } return $abbreviations; } /** * Merges items into larger sentences. * * Multibyte safe * * @param array $shorts * @return array */ private function sentenceMerge($shorts) { $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators); $sentences = array(); $sentence = ''; $has_words = false; $previous_word_ending = null; foreach ($shorts as $short) { $word_count = count(mb_split('\s+', self::mbTrim($short))); $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals); if ($after_non_abbreviating_terminal || ($has_words && $word_count > 1)) { $sentences[] = $sentence; $sentence = ''; $has_words = $word_count > 1; } else { $has_words = $has_words || $word_count > 1; } $sentence.= $short; $previous_word_ending = mb_substr($short, -1); } if (!empty($sentence)) { $sentences[] = $sentence; } return $sentences; } /** * Return the sentences sentences detected in the provided text. * Set the Sentence::SPLIT_TRIM flag to trim whitespace. * @param string $text * @param integer $flags * @return array */ public function split($text, $flags = 0) { $sentences = array(); // Split foreach (self::linebreakSplit($text) as $line) { if (self::mbTrim($line) !== '') { $punctuations = $this->punctuationSplit($line); $merges = $this->punctuationMerge($punctuations); $shorts = $this->abbreviationMerge($merges); $sentences = array_merge($sentences, $this->sentenceMerge($shorts)); } } // Post process if ($flags & self::SPLIT_TRIM) { foreach ($sentences as &$sentence) { $sentence = self::mbTrim($sentence); } unset($sentence); } return $sentences; } /** * Return the number of sentences detected in the provided text. * @param string $text * @return integer */ public function count($text) { return count($this->split($text)); } }