TextStatistics.php

<?php

    /*

        TextStatistics Class
        http://code.google.com/p/php-text-statistics/

        Released under New BSD license
        http://www.opensource.org/licenses/bsd-license.php

        Calculates following readability scores (formulae can be found in wiki):
          * Flesch Kincaid Reading Ease
          * Flesch Kincaid Grade Level
          * Gunning Fog Score
          * Coleman Liau Index
          * SMOG Index
          * Automated Readability Index

        Will also give:
          * String length
          * Letter count
          * Syllable count
          * Sentence count
          * Average words per sentence
          * Average syllables per word
        
        Sample Code
        ----------------
        $statistics = new TextStatistics;
        $text = 'The quick brown fox jumped over the lazy dog.';
        echo 'Flesch-Kincaid Reading Ease: ' . $statistics->flesch_kincaid_reading_ease($text);

    */

    /**
     * Readability scores and basic text statistics for English text.
     *
     * Every public method accepts raw text (HTML tags are stripped) and cleans
     * it itself, so the methods can be called independently of each other.
     * Multibyte-aware string functions are used when the mbstring extension is
     * available; otherwise the single-byte equivalents are used.
     */
    class TextStatistics {

        /**
         * Character encoding passed to the mb_* functions.
         * An empty string means "let mbstring use its default encoding".
         */
        protected $strEncoding = '';

        /**
         * Constructor.
         *
         * @param string  $strEncoding    Optional character encoding.
         * @return void
         */
        public function __construct($strEncoding = '') {
            if ($strEncoding <> '') {
                // Encoding is given. Use it!
                $this->strEncoding = $strEncoding;
            }
        }

        /**
         * Gives the Flesch-Kincaid Reading Ease of text entered, rounded to one digit.
         * Formula: 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words).
         * @param   strText         Text to be checked
         * @return  float           Reading ease score
         */
        public function flesch_kincaid_reading_ease($strText) {
            $strText = $this->clean_text($strText);
            $fltWordsPerSentence = $this->average_words_per_sentence($strText);
            $fltSyllablesPerWord = $this->average_syllables_per_word($strText);
            return round(206.835 - (1.015 * $fltWordsPerSentence) - (84.6 * $fltSyllablesPerWord), 1);
        }

        /**
         * Gives the Flesch-Kincaid Grade Level of text entered, rounded to one digit.
         * Formula: 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59.
         * @param   strText         Text to be checked
         * @return  float           Grade level
         */
        public function flesch_kincaid_grade_level($strText) {
            $strText = $this->clean_text($strText);
            $fltWordsPerSentence = $this->average_words_per_sentence($strText);
            $fltSyllablesPerWord = $this->average_syllables_per_word($strText);
            return round((0.39 * $fltWordsPerSentence) + (11.8 * $fltSyllablesPerWord) - 15.59, 1);
        }

        /**
         * Gives the Gunning-Fog score of text entered, rounded to one digit.
         * Formula: 0.4 * ((words / sentences) + percentage of long words),
         * where long words are those with three or more syllables, proper nouns excluded.
         * @param   strText         Text to be checked
         * @return  float           Gunning-Fog score
         */
        public function gunning_fog_score($strText) {
            $strText = $this->clean_text($strText);
            $fltWordsPerSentence = $this->average_words_per_sentence($strText);
            $fltLongWordPercentage = $this->percentage_words_with_three_syllables($strText, false);
            return round(($fltWordsPerSentence + $fltLongWordPercentage) * 0.4, 1);
        }

        /**
         * Gives the Coleman-Liau Index of text entered, rounded to one digit.
         * Formula: 0.0588 * L - 0.296 * S - 15.8, where L is the number of letters
         * per 100 words and S is the number of sentences per 100 words.
         * @param   strText         Text to be checked
         * @return  float           Coleman-Liau index
         */
        public function coleman_liau_index($strText) {
            $strText = $this->clean_text($strText);
            $intWords = $this->word_count($strText);
            // Both ratios are expressed per 100 words, as the formula requires.
            $fltLettersPer100Words = ($this->letter_count($strText) / $intWords) * 100;
            $fltSentencesPer100Words = ($this->sentence_count($strText) / $intWords) * 100;
            return round((0.0588 * $fltLettersPer100Words) - (0.296 * $fltSentencesPer100Words) - 15.8, 1);
        }

        /**
         * Gives the SMOG Index of text entered, rounded to one digit.
         * Formula: 1.043 * sqrt(long words * (30 / sentences)) + 3.1291,
         * where long words are those with three or more syllables.
         * @param   strText         Text to be checked
         * @return  float           SMOG index
         */
        public function smog_index($strText) {
            $strText = $this->clean_text($strText);
            $intLongWords = $this->words_with_three_syllables($strText);
            $intSentences = $this->sentence_count($strText);
            // The 3.1291 constant is added after taking the square root.
            return round((1.043 * sqrt($intLongWords * (30 / $intSentences))) + 3.1291, 1);
        }

        /**
         * Gives the Automated Readability Index of text entered, rounded to one digit.
         * Formula: 4.71 * (letters / words) + 0.5 * (words / sentences) - 21.43.
         * @param   strText         Text to be checked
         * @return  float           Automated Readability Index
         */
        public function automated_readability_index($strText) {
            $strText = $this->clean_text($strText);
            $intWords = $this->word_count($strText);
            $fltLettersPerWord = $this->letter_count($strText) / $intWords;
            $fltWordsPerSentence = $intWords / $this->sentence_count($strText);
            return round((4.71 * $fltLettersPerWord) + (0.5 * $fltWordsPerSentence) - 21.43, 1);
        }

        /**
         * Gives string length. Uses mb_strlen and, if that is unavailable or fails, regular strlen.
         * @param   strText      Text to be measured
         * @return  int          Length of the text
         */
        public function text_length($strText) {
            return $this->multibyte_call('mb_strlen', 'strlen', array($strText));
        }

        /**
         * Gives letter count (ignores everything except A-Z and a-z).
         * @param   strText      Text to be measured
         * @return  int          Number of letters
         */
        public function letter_count($strText) {
            $strText = $this->clean_text($strText); // To clear out newlines etc
            $strLettersOnly = preg_replace('/[^A-Za-z]+/', '', $strText);
            return $this->text_length($strLettersOnly);
        }

        /**
         * Trims, removes line breaks, multiple spaces and generally cleans text before processing.
         *
         * The result has HTML removed, every sentence terminator unified to a
         * full stop followed by a single space, exactly one terminator at the
         * end, and the word following each terminator lower-cased.
         *
         * @param   strText      Text to be transformed
         * @return  string       Cleaned text
         */
        protected function clean_text($strText) {
            // The closing tags below end a block of text, so treat them as a full stop.
            $arrFullStopTags = array('li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dd');
            foreach ($arrFullStopTags as $strTag) {
                $strText = str_ireplace('</' . $strTag . '>', '.', $strText);
            }
            $strText = strip_tags($strText);
            $strText = preg_replace('/[,:;()-]/', ' ', $strText); // Replace commas, hyphens etc (count them as spaces)
            $strText = preg_replace('/[\.!?]/', '.', $strText); // Unify terminators
            $strText = trim($strText) . '.'; // Add final terminator, just in case it's missing.
            $strText = preg_replace('/[ ]*(\n|\r\n|\r)[ ]*/', ' ', $strText); // Replace new lines with spaces
            $strText = preg_replace('/([\.])[\. ]+/', '$1', $strText); // Check for duplicated terminators
            $strText = trim(preg_replace('/[ ]*([\.])/', '$1 ', $strText)); // Pad sentence terminators
            $strText = preg_replace('/[ ]+/', ' ', $strText); // Remove multiple spaces
            // Lower case all words following terminators, so that sentence-initial
            // capitals are not mistaken for proper nouns by the Gunning-Fog score.
            // A closure is used because create_function() no longer exists in PHP 8.
            $strText = preg_replace_callback(
                '/\. [^ ]+/',
                function ($arrMatches) {
                    return strtolower($arrMatches[0]);
                },
                $strText
            );
            return $strText;
        }

        /**
         * Converts string to lower case. Uses mb_strtolower and, if that is unavailable or fails, regular strtolower.
         * @param   strText      Text to be transformed
         * @return  string       Lower-cased text
         */
        protected function lower_case($strText) {
            return $this->multibyte_call('mb_strtolower', 'strtolower', array($strText));
        }

        /**
         * Converts string to upper case. Uses mb_strtoupper and, if that is unavailable or fails, regular strtoupper.
         * @param   strText      Text to be transformed
         * @return  string       Upper-cased text
         */
        protected function upper_case($strText) {
            return $this->multibyte_call('mb_strtoupper', 'strtoupper', array($strText));
        }

        /**
         * Gets portion of string. Uses mb_substr and, if that is unavailable or fails, regular substr.
         * @param   strText      Text to be cut up
         * @param   intStart     Start character
         * @param   intLength    Length
         * @return  string       Requested portion of the text
         */
        protected function substring($strText, $intStart, $intLength) {
            return $this->multibyte_call('mb_substr', 'substr', array($strText, $intStart, $intLength));
        }

        /**
         * Calls a multibyte string function, falling back to its single-byte counterpart.
         *
         * The configured encoding, when set, is appended as the last argument of
         * the multibyte call only. The fallback is used when the multibyte
         * function does not exist, returns false, or throws.
         *
         * @param   strMultibyteFunction   Name of the mb_* function to try first
         * @param   strFallbackFunction    Name of the single-byte function to fall back to
         * @param   arrArguments           Arguments shared by both functions
         * @return  mixed                  Result of whichever function succeeded
         */
        protected function multibyte_call($strMultibyteFunction, $strFallbackFunction, $arrArguments) {
            if (function_exists($strMultibyteFunction)) {
                $arrMultibyteArguments = $arrArguments;
                if ($this->strEncoding != '') {
                    $arrMultibyteArguments[] = $this->strEncoding;
                }
                try {
                    $mixResult = call_user_func_array($strMultibyteFunction, $arrMultibyteArguments);
                    if ($mixResult !== false) {
                        return $mixResult;
                    }
                    // false signals failure (e.g. unknown encoding before PHP 8).
                } catch (Exception $e) {
                    // Fall through to the single-byte function.
                } catch (Error $e) {
                    // PHP 8 raises ValueError for an unknown encoding; fall through.
                }
            }
            return call_user_func_array($strFallbackFunction, $arrArguments);
        }

        /**
         * Returns sentence count for text (never less than 1).
         * @param   strText      Text to be measured
         * @return  int          Number of sentences
         */
        public function sentence_count($strText) {
            $strText = $this->clean_text($strText);
            // Will be tripped up by "Mr." or "U.K.". Not a major concern at this point.
            $intTerminators = preg_match_all('/[\.!?]/', $strText, $arrTerminators);
            return max(1, (int) $intTerminators);
        }

        /**
         * Returns word count for text (never less than 1).
         * @param   strText      Text to be measured
         * @return  int          Number of words
         */
        public function word_count($strText) {
            $strText = $this->clean_text($strText);
            // Will be tripped up by em dashes with spaces either side, among other similar characters.
            // Cleaned text separates words with single spaces, so space count + 1 is word count.
            return 1 + substr_count($strText, ' ');
        }

        /**
         * Returns average words per sentence for text.
         * @param   strText      Text to be measured
         * @return  int|float    Words divided by sentences
         */
        public function average_words_per_sentence($strText) {
            $strText = $this->clean_text($strText);
            $intSentenceCount = $this->sentence_count($strText);
            $intWordCount = $this->word_count($strText);
            return ($intWordCount / $intSentenceCount);
        }

        /**
         * Returns average syllables per word for text.
         * @param   strText      Text to be measured
         * @return  int|float    Syllables divided by words
         */
        public function average_syllables_per_word($strText) {
            $strText = $this->clean_text($strText);
            $intSyllableCount = 0;
            foreach (explode(' ', $strText) as $strWord) {
                $intSyllableCount += $this->syllable_count($strWord);
            }
            return ($intSyllableCount / $this->word_count($strText));
        }

        /**
         * Returns the number of words with three or more syllables.
         * @param   strText                  Text to be measured
         * @param   blnCountProperNouns      Boolean - should proper nouns be included in words count
         * @return  int                      Number of long words
         */
        public function words_with_three_syllables($strText, $blnCountProperNouns = true) {
            $strText = $this->clean_text($strText);
            $intLongWordCount = 0;
            foreach (explode(' ', $strText) as $strWord) {
                if ($this->syllable_count($strWord) < 3) {
                    continue;
                }
                if ($blnCountProperNouns) {
                    $intLongWordCount++;
                    continue;
                }
                // A word is treated as a proper noun when its first character
                // is unchanged by upper-casing; only the others are counted.
                $strFirstLetter = $this->substring($strWord, 0, 1);
                if ($strFirstLetter !== $this->upper_case($strFirstLetter)) {
                    $intLongWordCount++;
                }
            }
            return ($intLongWordCount);
        }

        /**
         * Returns the percentage of words with three or more syllables.
         * @param   strText                  Text to be measured
         * @param   blnCountProperNouns      Boolean - should proper nouns be included in words count
         * @return  int|float                Percentage between 0 and 100
         */
        public function percentage_words_with_three_syllables($strText, $blnCountProperNouns = true) {
            $strText = $this->clean_text($strText);
            $intWordCount = $this->word_count($strText);
            $intLongWordCount = $this->words_with_three_syllables($strText, $blnCountProperNouns);
            return (($intLongWordCount / $intWordCount) * 100);
        }

        /**
         * Returns the number of syllables in the word (never less than 1).
         * Based in part on Greg Fast's Perl module Lingua::EN::Syllables
         *
         * The word is lower-cased and stripped of everything but a-z first.
         * Known exceptions are then looked up, single-syllable prefixes and
         * suffixes are removed and counted, vowel groups in the remainder are
         * counted, and the total is adjusted by pattern-based corrections.
         *
         * @param   strWord      Word to be measured
         * @return  int          Number of syllables
         */
        public function syllable_count($strWord) {

            // Normalise before any lookup: words taken from cleaned text may carry
            // a trailing full stop, which would otherwise defeat both the
            // problem-word lookup and the end-anchored suffix patterns below.
            $strWord = $this->lower_case($strWord);
            $strWord = preg_replace('/[^a-z]/is', '', $strWord);

            // Specific common exceptions that don't follow the rule set below are handled individually
            // Array of problem words (with word as key, syllable count as value)
            $arrProblemWords = array(
                 'simile' => 3
                ,'forever' => 3
                ,'shoreline' => 2
            );
            if (isset($arrProblemWords[$strWord])) {
                return $arrProblemWords[$strWord];
            }

            // These syllables would be counted as two but should be one
            $arrSubSyllables = array(
                 'cial'
                ,'tia'
                ,'cius'
                ,'cious'
                ,'giu'
                ,'ion'
                ,'iou'
                ,'sia$'
                ,'[^aeiuoyt]{2,}ed$'
                ,'.ely$'
                ,'[cg]h?e[rsd]?$'
                ,'rved?$'
                ,'[aeiouy][dt]es?$'
                ,'[aeiouy][^aeiouydt]e[rsd]?$'
                ,'^[dr]e[aeiou][^aeiou]+$' // Sorts out deal, deign etc
                ,'[aeiouy]rse$' // Purse, hearse
            );

            // These syllables would be counted as one but should be two
            $arrAddSyllables = array(
                 'ia'
                ,'riet'
                ,'dien'
                ,'iu'
                ,'io'
                ,'ii'
                ,'[aeiouym]bl$'
                ,'[aeiou]{3}'
                ,'^mc'
                ,'ism$'
                ,'([^aeiouy])\1l$'
                ,'[^l]lien'
                ,'^coa[dglx].'
                ,'[^gq]ua[^auieo]'
                ,'dnt$'
                ,'uity$'
                ,'ie(r|st)$'
            );

            // Single syllable prefixes and suffixes
            $arrPrefixSuffix = array(
                 '/^un/'
                ,'/^fore/'
                ,'/ly$/'
                ,'/less$/'
                ,'/ful$/'
                ,'/ers?$/'
                ,'/ings?$/'
            );

            // Remove prefixes and suffixes and count how many were taken
            $intPrefixSuffixCount = 0;
            $strWord = preg_replace($arrPrefixSuffix, '', $strWord, -1, $intPrefixSuffixCount);

            // Each run of vowels in what is left counts as one syllable
            $arrVowelGroups = preg_split('/[^aeiouy]+/', $strWord, -1, PREG_SPLIT_NO_EMPTY);
            $intSyllableCount = count($arrVowelGroups) + $intPrefixSuffixCount;

            // Some syllables do not follow normal rules - check for them
            // Thanks to Joe Kovar for correcting a bug in the following lines
            foreach ($arrSubSyllables as $strSyllable) {
                $intSyllableCount -= preg_match('~' . $strSyllable . '~', $strWord);
            }
            foreach ($arrAddSyllables as $strSyllable) {
                $intSyllableCount += preg_match('~' . $strSyllable . '~', $strWord);
            }

            // Every word has at least one syllable, even if the corrections overshoot.
            return max(1, $intSyllableCount);
        }

    }

?>