diff options
Diffstat (limited to 'MLEB/Translate/ttmserver/SolrTTMServer.php')
-rw-r--r-- | MLEB/Translate/ttmserver/SolrTTMServer.php | 445 |
1 files changed, 0 insertions, 445 deletions
<?php
/**
 * TTMServer - The Translate extension translation memory interface
 *
 * @file
 * @author Niklas Laxström
 * @copyright Copyright © 2012-2013, Niklas Laxström
 * @license GPL-2.0-or-later
 * @ingroup TTMServer
 */

/**
 * TTMServer backend based on a Solr instance. Depends on Solarium.
 * @since 2012-06-27
 * @ingroup TTMServer
 * @deprecated 1.27. Will be removed in 1.29.
 */
class SolrTTMServer
	extends TTMServer
	implements ReadableTTMServer, SearchableTTMServer, WritableTTMServer
{
	/**
	 * In case auto-commit is not enabled, or even if it is, tell solr to
	 * commit before this time has passed, in milliseconds.
	 */
	const COMMIT_WITHIN = 5000;

	/**
	 * Solarium client used for all select and update requests.
	 */
	protected $client;

	/**
	 * Reference to the maintenance script to relay logging output.
	 */
	protected $logger;

	/**
	 * @param array $config Server configuration. If it has a 'config' key,
	 *  that value is passed to the Solarium client constructor.
	 */
	public function __construct( $config ) {
		wfDeprecated( __METHOD__, '1.24' );

		parent::__construct( $config );

		if ( isset( $config['config'] ) ) {
			$this->client = new Solarium_Client( $config['config'] );
		} else {
			$this->client = new Solarium_Client();
		}
	}

	/**
	 * Tells whether a suggestion originates from the current wiki.
	 * @param array $suggestion Suggestion as returned by query().
	 * @return bool
	 */
	public function isLocalSuggestion( array $suggestion ) {
		return $suggestion['wiki'] === wfWikiID();
	}

	/**
	 * Returns the location of the suggestion, which is its stored URI.
	 * @param array $suggestion Suggestion as returned by query().
	 * @return string
	 */
	public function expandLocation( array $suggestion ) {
		return $suggestion['uri'];
	}

	/**
	 * Fetches translation suggestions, converting Solarium failures into
	 * TranslationHelperException.
	 * @see ReadableTTMServer::query
	 * @param string $sourceLanguage
	 * @param string $targetLanguage
	 * @param string $text
	 * @return array[] Suggestions, best match first.
	 * @throws TranslationHelperException
	 */
	public function query( $sourceLanguage, $targetLanguage, $text ) {
		try {
			return $this->doQuery( $sourceLanguage, $targetLanguage, $text );
		} catch ( Solarium_Exception $e ) {
			throw new TranslationHelperException( 'Solarium exception: ' . $e );
		}
	}

	/**
	 * Does the actual work for query().
	 * @see ReadableTTMServer::query
	 * @param string $sourceLanguage
	 * @param string $targetLanguage
	 * @param string $text
	 * @return array[] Suggestions sorted by descending quality. Each has the
	 *  keys source, target, context, quality, wiki, location and uri.
	 */
	protected function doQuery( $sourceLanguage, $targetLanguage, $text ) {
		/* Two query system:
		 * 1) Find all strings in source language that match text
		 * 2) Do another query for translations for those strings
		 */

		// For now impose a length limit on query string to avoid doing
		// very slow queries. Magic number.
		if ( strlen( $text ) > 789 ) {
			return [];
		}

		$query = $this->client->createSelect();
		$query->setFields( [ 'globalid', 'content', 'score' ] );

		/* The interface usually displays three best candidates. These might
		 * come from more than three matches, if the translation is the same.
		 * This might not find all suggestions, if the top N best matching
		 * source texts don't have translations, but worse matches do. We
		 * could loop with start parameter to fetch more until we have enough
		 * suggestions or the quality drops below the cutoff point. */
		$query->setRows( 25 );

		/* Our string can contain all kind of nasty characters, so we need
		 * escape them with great pain. */
		$helper = $query->getHelper();
		$dist = $helper->escapePhrase( $text );
		// "edit" could also be ngram of other algorithm
		$dist = "strdist($dist,content,edit)";
		/* Note how we need to escape twice here, first the string for strdist
		 * and then the strdist call itself for the query. And of course every-
		 * thing will be URL encoded once sent over the line. */
		$query->setQuery( '_val_:%P1%', [ $dist ] );

		/* Filter queries are supposed to be efficient as they are separately
		 * cached, but I haven't done any benchmarks. */
		$query->createFilterQuery( 'lang' )
			->setQuery( 'language:%P1%', [ $sourceLanguage ] );

		$resultset = $this->client->select( $query );

		/* This loop is doing two unrelated things:
		 * 1) Collect the message contents and scores so that they can
		 *    be accessed later for the translations we found.
		 * 2) Build the query string for the query that fetches the
		 *    translations.
		 * This code is a bit uglier than I'd like it to be, since
		 * there is no field that globally identifies a message (message
		 * definition and translations). */
		$contents = $scores = [];
		$queryString = '';
		foreach ( $resultset as $doc ) {
			// Strip the trailing "/language" part to get the language independent id
			$sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
			$contents[$sourceId] = $doc->content;
			$scores[$sourceId] = $doc->score;

			$globalid = $helper->escapePhrase( "$sourceId/$targetLanguage" );
			$queryString .= "globalid:$globalid ";
		}

		// No matching source strings: there is nothing to fetch translations
		// for, and an empty query string is not a meaningful query.
		if ( $queryString === '' ) {
			return [];
		}

		// Second query to fetch available translations
		$fetchQuery = $this->client->createSelect();
		$fetchQuery->setFields( [ 'wiki', 'uri', 'content', 'messageid', 'globalid' ] );
		// These come in random order, so have to fetch all and sort
		$fetchQuery->setRows( 25 );
		$fetchQuery->setQuery( $queryString );
		// With AND we would not find anything, obviously.
		$fetchQuery->setQueryDefaultOperator( Solarium_Query_Select::QUERY_OPERATOR_OR );

		$translations = $this->client->select( $fetchQuery );

		$suggestions = [];
		foreach ( $translations as $doc ) {
			/* Construct the matching source id */
			$sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );

			/* Unfortunately we cannot do this on the search server,
			 * because score is not a real field and thus cannot be
			 * used in a filter query. */
			$quality = $scores[$sourceId];
			if ( $quality < $this->config['cutoff'] ) {
				continue;
			}

			$suggestions[] = [
				'source' => $contents[$sourceId],
				'target' => $doc->content,
				'context' => $doc->messageid,
				'quality' => $quality,
				'wiki' => $doc->wiki,
				'location' => $doc->messageid . '/' . $targetLanguage,
				'uri' => $doc->uri,
			];
		}

		/* Like mentioned above, we get results in random order. Sort them
		 * now to have best matches first as expected by callers. */
		uasort( $suggestions, function ( $a, $b ) {
			if ( $a['quality'] === $b['quality'] ) {
				return 0;
			}

			return ( $a['quality'] < $b['quality'] ) ? 1 : -1;
		} );

		return $suggestions;
	}

	/* Write functions */

	/**
	 * Updates the index for one message page.
	 * @param MessageHandle $handle
	 * @param string|null $targetText New text, or null to only delete the
	 *  existing entry.
	 * @return bool False if the handle has no language code or the update
	 *  request failed, true otherwise.
	 */
	public function update( MessageHandle $handle, $targetText ) {
		if ( $handle->getCode() === '' ) {
			return false;
		}

		/* There are various different cases here:
		 * [new or updated] [fuzzy|non-fuzzy] [translation|definition]
		 * 1) We don't distinguish between new or updated here.
		 * 2) Delete old translation, but not definition
		 * 3) Insert new translation or definition, if non-fuzzy
		 *    The definition should never be fuzzied anyway.
		 *
		 * These only apply to known messages.
		 */

		$update = $this->client->createUpdate();
		$title = $handle->getTitle();

		$doDelete = true;
		$sourceLanguage = '';
		if ( $handle->isValid() ) {
			$sourceLanguage = $handle->getGroup()->getSourceLanguage();
			if ( $handle->getCode() === $sourceLanguage ) {
				$doDelete = false;
			}
		}

		if ( $doDelete ) {
			$base = Title::makeTitle( $title->getNamespace(), $handle->getKey() );
			$fields = [
				'wiki' => wfWikiID(),
				'language' => $handle->getCode(),
				'messageid' => $base->getPrefixedText(),
			];

			// Build a separate list of clauses instead of rewriting the array
			// through a by-reference loop variable, which would leave a
			// dangling reference behind.
			$helper = $update->getHelper();
			$conds = [];
			foreach ( $fields as $key => $value ) {
				$conds[] = "$key:" . $helper->escapePhrase( $value );
			}
			$update->addDeleteQuery( implode( ' AND ', $conds ) );
		}

		if ( $targetText !== null ) {
			if ( $handle->isValid() ) {
				// Of the message definition page
				$sourceTitle = Title::makeTitle(
					$title->getNamespace(),
					$handle->getKey() . '/' . $sourceLanguage
				);
				/* Note: in some cases the source page might not exist, in this case
				 * we use 0 as message version identifier, to differentiate them from
				 * orphan messages */
				$revId = (int)$sourceTitle->getLatestRevID();
			} else {
				$revId = 'orphan';
			}

			$doc = $this->createDocument( $handle, $targetText, $revId );
			// Add document and commit within COMMIT_WITHIN milliseconds.
			$update->addDocument( $doc, null, self::COMMIT_WITHIN );
		}

		try {
			$this->client->update( $update );
		} catch ( Solarium_Exception $e ) {
			// Best effort: report the reason and let the caller carry on.
			error_log( 'SolrTTMServer update-write failed: ' . $e->getMessage() );

			return false;
		}

		return true;
	}

	/**
	 * Builds the Solr document for one message in one language.
	 * @see schema.xml
	 * @param MessageHandle $handle
	 * @param string $text
	 * @param int|string $revId Revision id of the message definition, or the
	 *  string 'orphan' for messages whose handle is not valid.
	 * @return Solarium_Document_ReadWrite
	 */
	protected function createDocument( MessageHandle $handle, $text, $revId ) {
		$language = $handle->getCode();
		$translationTitle = $handle->getTitle();

		$title = Title::makeTitle( $translationTitle->getNamespace(), $handle->getKey() );
		$wiki = wfWikiID();
		$messageid = $title->getPrefixedText();
		$globalid = "$wiki-$messageid-$revId/$language";

		$doc = new Solarium_Document_ReadWrite();
		$doc->wiki = $wiki;
		$doc->uri = $translationTitle->getCanonicalURL();
		$doc->messageid = $messageid;
		$doc->globalid = $globalid;

		$doc->language = $language;
		$doc->content = $text;
		$doc->setField( 'group', $handle->getGroupIds() );

		return $doc;
	}

	/**
	 * Starts a bootstrap by deleting every document of the current wiki
	 * and committing that deletion.
	 */
	public function beginBootstrap() {
		$update = $this->client->createUpdate();
		$query = 'wiki:' . $update->getHelper()->escapePhrase( wfWikiID() );
		$update->addDeleteQuery( $query );
		$update->addCommit();
		$this->client->update( $update );
	}

	/**
	 * Nothing needs to be done at the start of a batch for this backend.
	 */
	public function beginBatch() {
		// Intentionally empty.
	}

	/**
	 * Inserts message definitions. Runs a LinkBatch over the titles first
	 * and then indexes them exactly like translations.
	 * @param array[] $batch List of [ MessageHandle, source language, text ].
	 */
	public function batchInsertDefinitions( array $batch ) {
		$lb = new LinkBatch();
		foreach ( $batch as $data ) {
			$lb->addObj( $data[0]->getTitle() );
		}
		$lb->execute();

		$this->batchInsertTranslations( $batch );
	}

	/**
	 * Indexes a batch of messages in a single update request. The request is
	 * attempted up to five times, sleeping ten seconds between attempts.
	 * @param array[] $batch List of [ MessageHandle, source language, text ].
	 * @throws Solarium_Client_HttpException If the last attempt fails too.
	 */
	public function batchInsertTranslations( array $batch ) {
		$update = $this->client->createUpdate();
		foreach ( $batch as $data ) {
			list( $handle, $sourceLanguage, $text ) = $data;
			$revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
			$doc = $this->createDocument( $handle, $text, $revId );
			// Add document and commit within COMMIT_WITHIN milliseconds.
			$update->addDocument( $doc, null, self::COMMIT_WITHIN );
		}

		$retries = 5;

		while ( $retries-- > 0 ) {
			try {
				$this->client->update( $update );
				break;
			} catch ( Solarium_Client_HttpException $e ) {
				// $retries was already decremented, so 0 means this was the
				// final attempt.
				if ( $retries === 0 ) {
					throw $e;
				} else {
					$c = get_class( $e );
					$msg = $e->getMessage();
					$this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" );
					sleep( 10 );
				}
			}
		}
	}

	/**
	 * Ends a batch by sending an update request with no commands added.
	 */
	public function endBatch() {
		$update = $this->client->createUpdate();
		$this->client->update( $update );
	}

	/**
	 * Ends a bootstrap by committing and optimizing the index.
	 */
	public function endBootstrap() {
		$update = $this->client->createUpdate();
		$update->addCommit();
		$update->addOptimize();
		$this->client->update( $update );
	}

	/**
	 * @return Solarium_Client The client created in the constructor.
	 */
	public function getSolarium() {
		return $this->client;
	}

	/**
	 * Sets the object used to relay log output. It must provide a
	 * statusLine() method, which logOutput() calls.
	 * @param object $logger
	 */
	public function setLogger( $logger ) {
		$this->logger = $logger;
	}

	/**
	 * Relays a line of output to the logger, if one has been set.
	 * @param string $text
	 */
	protected function logOutput( $text ) {
		if ( $this->logger ) {
			$this->logger->statusLine( "$text\n" );
		}
	}

	/**
	 * Search interface
	 * @param string $queryString
	 * @param array $opts Uses the keys limit, offset, language and group.
	 *  An empty string for language or group disables that filter.
	 * @param array $highlight Two items: highlight prefix and postfix.
	 * @return mixed The result of the Solarium select request.
	 * @throws TTMServerException If the request to Solr fails.
	 */
	public function search( $queryString, $opts, $highlight ) {
		$client = $this->getSolarium();

		$query = $client->createSelect();
		$dismax = $query->getDisMax();
		$dismax->setQueryParser( 'edismax' );
		$query->setQuery( $queryString );
		$query->setRows( $opts['limit'] );
		$query->setStart( $opts['offset'] );

		list( $pre, $post ) = $highlight;
		$hl = $query->getHighlighting();
		$hl->setFields( 'text' );
		$hl->setSimplePrefix( $pre );
		$hl->setSimplePostfix( $post );
		$hl->setMaxAnalyzedChars( '5000' );
		$hl->setFragSize( '5000' );
		$hl->setSnippets( 1 );

		// Both filters are tagged so that the facets below can exclude them.
		$languageFilter = $opts['language'];
		if ( $languageFilter !== '' ) {
			$query->createFilterQuery( 'languageFilter' )
				->setQuery( 'language:%P1%', [ $languageFilter ] )
				->addTag( 'filter' );
		}

		$groupFilter = $opts['group'];
		if ( $groupFilter !== '' ) {
			$query->createFilterQuery( 'groupFilter' )
				->setQuery( 'group:%P1%', [ $groupFilter ] )
				->addTag( 'filter' );
		}

		$facetSet = $query->getFacetSet();

		$language = $facetSet->createFacetField( 'language' );
		$language->setField( 'language' );
		$language->setMinCount( 1 );
		$language->addExclude( 'filter' );

		$group = $facetSet->createFacetField( 'group' );
		$group->setField( 'group' );
		$group->setMinCount( 1 );
		$group->setMissing( true );
		$group->addExclude( 'filter' );

		try {
			return $client->select( $query );
		} catch ( Solarium_Client_HttpException $e ) {
			// TTMServer is the base class of this server, not an exception,
			// so it cannot be thrown. Use the translation memory exception
			// type and keep the original exception for debugging.
			throw new TTMServerException( $e->getMessage(), 0, $e );
		}
	}

	/**
	 * Extracts the language and group facets from a search result.
	 * @param mixed $resultset Result returned by search().
	 * @return array[] Arrays under the keys 'language' and 'group'.
	 */
	public function getFacets( $resultset ) {
		return [
			'language' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'language' ) ),
			'group' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'group' ) ),
		];
	}

	/**
	 * @param mixed $resultset Result returned by search().
	 * @return int Value of getNumFound() of the result.
	 */
	public function getTotalHits( $resultset ) {
		return $resultset->getNumFound();
	}

	/**
	 * Converts a search result into a list of plain field arrays.
	 * @param mixed $resultset Result returned by search().
	 * @return array[] One array per document. 'content' holds the highlighted
	 *  text if there is any, otherwise the document's text field.
	 */
	public function getDocuments( $resultset ) {
		$highlighting = $resultset->getHighlighting();
		$ret = [];
		foreach ( $resultset as $document ) {
			$fields = iterator_to_array( $document );
			// Compatibility mapping
			$fields['localid'] = $fields['messageid'];

			$hdoc = $highlighting->getResult( $document->globalid );
			$text = $hdoc->getField( 'text' );
			if ( $text === [] ) {
				$text = $document->text;
			} else {
				$text = $text[0];
			}

			$fields['content'] = $text;
			$ret[] = $fields;
		}

		return $ret;
	}
}