diff options
Diffstat (limited to 'MLEB/Translate/utils/ExternalMessageSourceStateComparator.php')
-rw-r--r-- | MLEB/Translate/utils/ExternalMessageSourceStateComparator.php | 317 |
1 files changed, 270 insertions, 47 deletions
diff --git a/MLEB/Translate/utils/ExternalMessageSourceStateComparator.php b/MLEB/Translate/utils/ExternalMessageSourceStateComparator.php index 5b24b5e0..a6b53f4f 100644 --- a/MLEB/Translate/utils/ExternalMessageSourceStateComparator.php +++ b/MLEB/Translate/utils/ExternalMessageSourceStateComparator.php @@ -7,33 +7,51 @@ * @license GPL-2.0-or-later * @since 2013.12 */ + +use MediaWiki\Extensions\Translate\MessageSync\MessageSourceChange; +use MediaWiki\Extensions\Translate\Utilities\StringComparators\StringComparator; + class ExternalMessageSourceStateComparator { /** Process all languages supported by the message group */ - const ALL_LANGUAGES = 'all languages'; + public const ALL_LANGUAGES = 'all languages'; - protected $changes = []; + /** + * @var StringComparator + */ + protected $stringComparator; /** - * Finds changes in external sources compared to wiki state. + * @param StringComparator $stringComparator + */ + public function __construct( StringComparator $stringComparator ) { + $this->stringComparator = $stringComparator; + } + + /** + * Finds modifications in external sources compared to wiki state. * - * The returned array is as following: - * - First level is indexed by language code - * - Second level is indexed by change type: - * - - addition (new message in the file) - * - - deletion (message in wiki not present in the file) - * - - change (difference in content) - * - Third level is a list of changes - * - Fourth level is change properties - * - - key (the message key) - * - - content (the message content in external source, null for deletions) + * The MessageSourceChange object returned stores the following about each modification, + * - First level of classification is the language code + * - Second level of classification is the type of modification, + * - addition (new message in the file) + * - deletion (message in wiki not present in the file) + * - change (difference in content) + * - rename (message key is modified) + * - Third level is a list of modifications + * - For each modification, the following is saved, + * - key (the message key) + * - content (the message content in external source, null for deletions) + * - matched_to (present in case of renames, key of the matched message) + * - similarity (present in case of renames, similarity % with the matched message) + * - previous_state (present in case of renames, state of the message before rename) * * @param FileBasedMessageGroup $group * @param array|string $languages - * @throws MWException - * @return array array[language code][change type] = change. + * @throws InvalidArgumentException + * @return MessageSourceChange */ public function processGroup( FileBasedMessageGroup $group, $languages ) { - $this->changes = []; + $changes = new MessageSourceChange(); $processAll = false; if ( $languages === self::ALL_LANGUAGES ) { @@ -47,7 +65,7 @@ class ExternalMessageSourceStateComparator { $languages = array_keys( $languages ); } elseif ( !is_array( $languages ) ) { - throw new MWException( 'Invalid input given for $languages' ); + throw new InvalidArgumentException( 'Invalid input given for $languages' ); } // Process the source language before others. Source language might not @@ -58,23 +76,25 @@ class ExternalMessageSourceStateComparator { $index = array_search( $sourceLanguage, $languages ); if ( $processAll || $index !== false ) { unset( $languages[$index] ); - $this->processLanguage( $group, $sourceLanguage ); + $this->processLanguage( $group, $sourceLanguage, $changes ); } - foreach ( $languages as $code ) { - $this->processLanguage( $group, $code ); + foreach ( $languages as $language ) { + $this->processLanguage( $group, $language, $changes ); } - return $this->changes; + return $changes; } - protected function processLanguage( FileBasedMessageGroup $group, $code ) { - $cache = new MessageGroupCache( $group, $code ); + protected function processLanguage( + FileBasedMessageGroup $group, $language, MessageSourceChange $changes + ) { + $cache = $group->getMessageGroupCache( $language ); $reason = 0; if ( !$cache->isValid( $reason ) ) { - $this->addMessageUpdateChanges( $group, $code, $reason, $cache ); + $this->addMessageUpdateChanges( $group, $language, $changes, $reason, $cache ); - if ( !isset( $this->changes[$code] ) ) { + if ( $changes->getModificationsForLanguage( $language ) === [] ) { /* Update the cache immediately if file and wiki state match. * Otherwise the cache will get outdated compared to file state * and will give false positive conflicts later. */ @@ -93,38 +113,40 @@ class ExternalMessageSourceStateComparator { * Now we must try to guess what in earth has driven the file state and * wiki state out of sync. Then we must compile list of events that would * bring those to sync. Types of events are addition, deletion, (content) - * change and possible rename in the future. After that the list of events - * are stored for later processing of a translation administrator, who can - * decide what actions to take on those events to bring the state more or - * less in sync. + * change and key renames. After that the list of events are stored for + * later processing of a translation administrator, who can decide what + * actions to take on those events to bring the state more or less in sync. * * @param FileBasedMessageGroup $group - * @param string $code Language code. + * @param string $language + * @param MessageSourceChange $changes * @param int $reason * @param MessageGroupCache $cache - * @throws MWException + * @throws RuntimeException */ - protected function addMessageUpdateChanges( FileBasedMessageGroup $group, $code, - $reason, $cache + protected function addMessageUpdateChanges( + FileBasedMessageGroup $group, $language, MessageSourceChange $changes, $reason, $cache ) { /* This throws a warning if message definitions are not yet * cached and will read the file for definitions. */ - MediaWiki\suppressWarnings(); - $wiki = $group->initCollection( $code ); - MediaWiki\restoreWarnings(); + Wikimedia\suppressWarnings(); + $wiki = $group->initCollection( $language ); + Wikimedia\restoreWarnings(); $wiki->filter( 'hastranslation', false ); $wiki->loadTranslations(); $wikiKeys = $wiki->getMessageKeys(); + $sourceLanguage = $group->getSourceLanguage(); // By-pass cached message definitions /** @var FFS $ffs */ $ffs = $group->getFFS(); - if ( $code === $group->getSourceLanguage() && !$ffs->exists( $code ) ) { - $path = $group->getSourceFilePath( $code ); - throw new MWException( "Source message file for {$group->getId()} does not exist: $path" ); + '@phan-var SimpleFFS $ffs'; + if ( $language === $sourceLanguage && !$ffs->exists( $language ) ) { + $path = $group->getSourceFilePath( $language ); + throw new RuntimeException( "Source message file for {$group->getId()} does not exist: $path" ); } - $file = $ffs->read( $code ); + $file = $ffs->read( $language ); // Does not exist if ( $file === false ) { @@ -136,7 +158,7 @@ class ExternalMessageSourceStateComparator { $id = $group->getId(); $ffsClass = get_class( $ffs ); - error_log( "$id has an FFS ($ffsClass) - it didn't return cake for $code" ); + error_log( "$id has an FFS ($ffsClass) - it didn't return cake for $language" ); return; } @@ -146,6 +168,7 @@ class ExternalMessageSourceStateComparator { $common = array_intersect( $fileKeys, $wikiKeys ); $supportsFuzzy = $ffs->supportsFuzzy(); + $changesToRemove = []; foreach ( $common as $key ) { $sourceContent = $file['MESSAGES'][$key]; @@ -185,16 +208,36 @@ class ExternalMessageSourceStateComparator { } } - $this->addChange( 'change', $code, $key, $sourceContent ); + if ( $language !== $sourceLanguage ) { + // Assuming that this is the old key, lets check if it has a corresponding + // rename in the source language. The key of the matching message will be + // the new renamed key. + $renameMsg = $changes->getMatchedMessage( $sourceLanguage, $key ); + if ( $renameMsg !== null ) { + // Rename present in source language but this message has a content change + // with the OLD key in a non-source language. We will not process this + // here but add it as a rename instead. This way, the key will be renamed + // and then the content updated. + $this->addNonSourceRenames( + $changes, $key, $renameMsg['key'], $sourceContent, $wikiContent, $language + ); + $changesToRemove[] = $key; + continue; + } + } + $changes->addChange( $language, $key, $sourceContent ); } + $changes->removeChanges( $language, $changesToRemove ); + $added = array_diff( $fileKeys, $wikiKeys ); foreach ( $added as $key ) { $sourceContent = $file['MESSAGES'][$key]; if ( trim( $sourceContent ) === '' ) { continue; } - $this->addChange( 'addition', $code, $key, $sourceContent ); + + $changes->addAddition( $language, $key, $sourceContent ); } /* Should the cache not exist, don't consider the messages @@ -209,15 +252,195 @@ class ExternalMessageSourceStateComparator { * must be a newly made in the wiki. */ continue; } - $this->addChange( 'deletion', $code, $key, null ); + $changes->addDeletion( $language, $key, $wiki[$key]->translation() ); } } + + if ( $language === $sourceLanguage ) { + $this->findAndMarkSourceRenames( $changes, $language ); + } else { + // Non source language + $this->checkNonSourceAdditionsForRename( + $changes, $sourceLanguage, $language, $wiki, $wikiKeys + ); + } } - protected function addChange( $type, $language, $key, $content ) { - $this->changes[$language][$type][] = [ + /** + * For non source languages, we look at additions and see if they have been + * added as renames in the source language. + * @param MessageSourceChange $changes + * @param string $sourceLanguage + * @param string $targetLanguage + * @param MessageCollection $wiki + * @param string[] $wikiKeys + */ + private function checkNonSourceAdditionsForRename( + MessageSourceChange $changes, $sourceLanguage, $targetLanguage, MessageCollection $wiki, $wikiKeys + ) { + $additions = $changes->getAdditions( $targetLanguage ); + if ( $additions === [] ) { + return; + } + + $additionsToRemove = []; + $deletionsToRemove = []; + foreach ( $additions as $addedMsg ) { + $addedMsgKey = $addedMsg['key']; + + // Check if this key is renamed in source. + $renamedSourceMsg = $changes->findMessage( + $sourceLanguage, $addedMsgKey, [ MessageSourceChange::RENAME ] + ); + + if ( $renamedSourceMsg === null ) { + continue; + } + + // Since this key is new, and is present in the renames for the source language, + // we will add it as a rename. + $deletedSource = $changes->getMatchedMessage( $sourceLanguage, $renamedSourceMsg['key'] ); + $deletedMsgKey = $deletedSource['key']; + $deletedMsg = $changes->findMessage( + $targetLanguage, $deletedMsgKey, [ MessageSourceChange::DELETION ] + ); + + // Sometimes when the cache does not have the translations, the deleted message + // is not added in the translations. It is also possible that for this non-source + // language the key has not been removed. + if ( $deletedMsg === null ) { + $content = ''; + if ( array_search( $deletedMsgKey, $wikiKeys ) !== false ) { + $content = $wiki[ $deletedMsgKey ]->translation(); + } + $deletedMsg = [ + 'key' => $deletedMsgKey, + 'content' => $content + ]; + } + + $similarityPercent = $this->stringComparator->getSimilarity( + $addedMsg['content'], $deletedMsg['content'] + ); + + $changes->addRename( $targetLanguage, [ + 'key' => $addedMsgKey, + 'content' => $addedMsg['content'] + ], [ + 'key' => $deletedMsgKey, + 'content' => $deletedMsg['content'] + ], $similarityPercent ); + + $deletionsToRemove[] = $deletedMsgKey; + $additionsToRemove[] = $addedMsgKey; + } + + $changes->removeAdditions( $targetLanguage, $additionsToRemove ); + $changes->removeDeletions( $targetLanguage, $deletionsToRemove ); + } + + /** + * Check for renames and add them to the changes. To identify renames we need to + * compare the contents of the added messages with the deleted ones and identify + * messages that match. + * @param MessageSourcechange $changes + * @param string $sourceLanguage + */ + private function findAndMarkSourceRenames( MessageSourceChange $changes, $sourceLanguage ) { + // Now check for renames. To identify renames we need to compare + // the contents of the added messages with the deleted ones and + // identify messages that match. + $deletions = $changes->getDeletions( $sourceLanguage ); + $additions = $changes->getAdditions( $sourceLanguage ); + if ( $deletions === [] || $additions === [] ) { + return; + } + + // This array contains a dictionary with matching renames in the following structure - + // [ A1|D1 => 1.0, A1|D2 => 0.95, A2|D1 => 0.95 ] + $potentialRenames = []; + foreach ( $additions as $addedMsg ) { + $addedMsgKey = $addedMsg['key']; + + foreach ( $deletions as $deletedMsg ) { + $similarityPercent = $this->stringComparator->getSimilarity( + $addedMsg['content'], $deletedMsg['content'] + ); + + if ( $changes->areStringsSimilar( $similarityPercent ) ) { + $potentialRenames[ $addedMsgKey . '|' . $deletedMsg['key'] ] = $similarityPercent; + } + } + } + + $this->matchRenames( $changes, $potentialRenames, $sourceLanguage ); + } + + /** + * Adds non source language renames to the list of changes + * @param MessageSourceChange $changes + * @param string $key + * @param string $renameKey + * @param string $sourceContent + * @param string $wikiContent + * @param string $language + */ + private function addNonSourceRenames( + MessageSourceChange $changes, $key, $renameKey, $sourceContent, $wikiContent, $language + ) { + $addedMsg = [ + 'key' => $renameKey, + 'content' => $sourceContent + ]; + + $removedMsg = [ 'key' => $key, - 'content' => $content, + 'content' => $wikiContent ]; + + $similarityPercent = $this->stringComparator->getSimilarity( + $sourceContent, $wikiContent + ); + $changes->addRename( $language, $addedMsg, $removedMsg, $similarityPercent ); + } + + /** + * Identifies which added message to be associated with the deleted message based on + * similarity percentage. + * + * We sort the $trackRename array on the similarity percentage and then start adding the + * messages as renames. + * @param MessageSourceChange $changes + * @param array $trackRename + * @param string $language + */ + private function matchRenames( MessageSourceChange $changes, array $trackRename, $language ) { + arsort( $trackRename, SORT_NUMERIC ); + + $alreadyRenamed = $additionsToRemove = $deletionsToRemove = []; + foreach ( $trackRename as $key => $similarityPercent ) { + list( $addKey, $deleteKey ) = explode( '|', $key, 2 ); + if ( isset( $alreadyRenamed[ $addKey ] ) || isset( $alreadyRenamed[ $deleteKey ] ) ) { + // Already mapped with another name. + continue; + } + + // Using key should be faster than saving values and searching for them in the array. + $alreadyRenamed[ $addKey ] = 1; + $alreadyRenamed[ $deleteKey ] = 1; + + $addMsg = $changes->findMessage( $language, $addKey, [ MessageSourceChange::ADDITION ] ); + $deleteMsg = $changes->findMessage( $language, $deleteKey, [ MessageSourceChange::DELETION ] ); + + $changes->addRename( $language, $addMsg, $deleteMsg, $similarityPercent ); + + // @phan-suppress-next-line PhanTypeArraySuspiciousNullable + $additionsToRemove[] = $addMsg['key']; + // @phan-suppress-next-line PhanTypeArraySuspiciousNullable + $deletionsToRemove[] = $deleteMsg['key']; + } + + $changes->removeAdditions( $language, $additionsToRemove ); + $changes->removeDeletions( $language, $deletionsToRemove ); } } |