diff options
Diffstat (limited to 'www/wiki/includes/specials/SpecialExport.php')
-rw-r--r-- | www/wiki/includes/specials/SpecialExport.php | 593 |
1 files changed, 593 insertions, 0 deletions
diff --git a/www/wiki/includes/specials/SpecialExport.php b/www/wiki/includes/specials/SpecialExport.php new file mode 100644 index 00000000..5a98bb90 --- /dev/null +++ b/www/wiki/includes/specials/SpecialExport.php @@ -0,0 +1,593 @@
<?php
/**
 * Implements Special:Export
 *
 * Copyright © 2003-2008 Brion Vibber <brion@pobox.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup SpecialPage
 */

use MediaWiki\MediaWikiServices;

/**
 * A special page that allows users to export pages in an XML file
 *
 * @ingroup SpecialPage
 */
class SpecialExport extends SpecialPage {
	/** @var bool Whether only the current revision of each page is exported */
	private $curonly;

	/** @var bool Whether this request streams an export instead of showing the form */
	private $doExport;

	/** @var int Validated number of page-link levels to follow (see validateLinkDepth()) */
	private $pageLinkDepth;

	/** @var bool Whether templates used by the requested pages are exported too */
	private $templates;

	/**
	 * Register the special page under the name 'Export'.
	 */
	public function __construct() {
		parent::__construct( 'Export' );
	}

	/**
	 * Entry point: either stream the XML export or render the export form.
	 *
	 * The request is classified into one of five cases, tested in this order:
	 *  - 'addcat': append the members of a category to the page list,
	 *  - 'addns': append the pages of a namespace to the page list
	 *    (only if ExportFromNamespaces is enabled),
	 *  - 'exportall': export everything (only if ExportAllowAll is enabled),
	 *  - a POST without subpage: export the submitted page list,
	 *  - anything else: export the page named by 'pages' or the subpage,
	 *    current revision unless 'history' is set.
	 * The first two only modify the page list and redisplay the form.
	 *
	 * @param string|null $par Subpage parameter, used as default page name
	 *  for non-POST requests
	 */
	public function execute( $par ) {
		$this->setHeaders();
		$this->outputHeader();
		$config = $this->getConfig();
		$request = $this->getRequest();

		// Set some variables
		$this->curonly = true;
		$this->doExport = false;
		$this->templates = $request->getCheck( 'templates' );
		$this->pageLinkDepth = $this->validateLinkDepth(
			$request->getIntOrNull( 'pagelink-depth' )
		);
		$nsindex = '';
		$exportall = false;

		// Defined up front so that every branch below leaves them set; an
		// undefined-variable notice would otherwise foul up the XML output.
		$page = '';
		$history = WikiExporter::CURRENT;

		if ( $request->getCheck( 'addcat' ) ) {
			$page = $request->getText( 'pages' );
			$catname = $request->getText( 'catname' );

			if ( $catname !== '' ) {
				$t = Title::makeTitleSafe( NS_MAIN, $catname );
				if ( $t ) {
					/**
					 * @todo FIXME: This can lead to hitting memory limit for very large
					 * categories. Ideally we would do the lookup synchronously
					 * during the export in a single query.
					 */
					$catpages = $this->getPagesFromCategory( $t );
					if ( $catpages ) {
						$page = $this->appendPageNames( $page, $catpages );
					}
				}
			}
		} elseif ( $request->getCheck( 'addns' ) && $config->get( 'ExportFromNamespaces' ) ) {
			$page = $request->getText( 'pages' );
			$nsindex = $request->getText( 'nsindex', '' );

			// Only a plain non-negative integer is a usable namespace index;
			// anything else (including the empty string) is ignored rather
			// than being handed to the database as a raw string.
			if ( ctype_digit( $nsindex ) ) {
				/**
				 * Same implementation as above, so same @todo
				 */
				$nspages = $this->getPagesFromNamespace( (int)$nsindex );
				if ( $nspages ) {
					$page = $this->appendPageNames( $page, $nspages );
				}
			}
		} elseif ( $request->getCheck( 'exportall' ) && $config->get( 'ExportAllowAll' ) ) {
			// $page and $history keep their defaults: doExport() ignores the
			// page list and forces full history when exporting everything.
			$this->doExport = true;
			$exportall = true;
		} elseif ( $request->wasPosted() && $par == '' ) {
			$page = $request->getText( 'pages' );
			$this->curonly = $request->getCheck( 'curonly' );
			$rawOffset = $request->getVal( 'offset' );
			$offset = $rawOffset ? wfTimestamp( TS_MW, $rawOffset ) : null;

			$maxHistory = $config->get( 'ExportMaxHistory' );
			$limit = $request->getInt( 'limit' );
			$dir = $request->getVal( 'dir' );
			$history = [
				'dir' => 'asc',
				'offset' => false,
				'limit' => $maxHistory,
			];
			$historyCheck = $request->getCheck( 'history' );

			if ( $this->curonly ) {
				$history = WikiExporter::CURRENT;
			} elseif ( !$historyCheck ) {
				// A requested limit is honoured only if positive and below the
				// configured maximum (a maximum of 0 places no bound on it).
				if ( $limit > 0 && ( $maxHistory == 0 || $limit < $maxHistory ) ) {
					$history['limit'] = $limit;
				}

				if ( $offset !== null ) {
					$history['offset'] = $offset;
				}

				// 'dir' may be absent from the request, hence the string cast.
				if ( strtolower( (string)$dir ) === 'desc' ) {
					$history['dir'] = 'desc';
				}
			}

			if ( $page !== '' ) {
				$this->doExport = true;
			}
		} else {
			// Default to current-only for GET requests.
			$page = $request->getText( 'pages', $par );
			$history = $request->getCheck( 'history' )
				? WikiExporter::FULL
				: WikiExporter::CURRENT;

			if ( $page !== '' ) {
				$this->doExport = true;
			}
		}

		if ( !$config->get( 'ExportAllowHistory' ) ) {
			// Override
			$history = WikiExporter::CURRENT;
		}

		$list_authors = $request->getCheck( 'listauthors' );
		if ( !$this->curonly || !$config->get( 'ExportAllowListContributors' ) ) {
			$list_authors = false;
		}

		if ( $this->doExport ) {
			$this->getOutput()->disable();

			// Cancel output buffering and gzipping if set
			// This should provide safer streaming for pages with history
			wfResetOutputBuffers();
			$response = $request->response();
			$response->header( "Content-type: application/xml; charset=utf-8" );
			$response->header( "X-Robots-Tag: noindex,nofollow" );

			if ( $request->getCheck( 'wpDownload' ) ) {
				// Provide a sane filename suggestion
				$filename = urlencode( $config->get( 'Sitename' ) . '-' . wfTimestampNow() . '.xml' );
				$response->header( "Content-disposition: attachment;filename={$filename}" );
			}

			$this->doExport( $page, $history, $list_authors, $exportall );

			return;
		}

		$this->showForm( $page, $nsindex );
	}

	/**
	 * Append page names to the newline-separated page list of the form.
	 *
	 * A separating newline is inserted only when the list already has
	 * content, so an empty list does not gain a leading blank line.
	 *
	 * @param string $page Current newline-separated page list
	 * @param string[] $names Page names to append
	 * @return string The extended page list
	 */
	private function appendPageNames( $page, array $names ) {
		if ( $page !== '' ) {
			$page .= "\n";
		}

		return $page . implode( "\n", $names );
	}

	/**
	 * Output the introductory message and the export form.
	 *
	 * Which fields are offered depends on the Export* configuration settings
	 * and on whether the user may override the export depth.
	 *
	 * @param string $page Newline-separated page list used to prefill the textarea
	 * @param string $nsindex Namespace index used as default of the namespace selector
	 */
	private function showForm( $page, $nsindex ) {
		$config = $this->getConfig();
		$request = $this->getRequest();
		$posted = $request->wasPosted();

		$out = $this->getOutput();
		$out->addWikiMsg( 'exporttext' );

		// The category box keeps its value only while the page list is empty.
		$categoryName = $page === '' ? $request->getText( 'catname' ) : '';

		$formDescriptor = [
			'catname' => [
				'type' => 'textwithbutton',
				'name' => 'catname',
				'horizontal-label' => true,
				'label-message' => 'export-addcattext',
				'default' => $categoryName,
				'size' => 40,
				'buttontype' => 'submit',
				'buttonname' => 'addcat',
				'buttondefault' => $this->msg( 'export-addcat' )->text(),
				'hide-if' => [ '===', 'exportall', '1' ],
			],
		];

		if ( $config->get( 'ExportFromNamespaces' ) ) {
			$formDescriptor += [
				'nsindex' => [
					'type' => 'namespaceselectwithbutton',
					'default' => $nsindex,
					'label-message' => 'export-addnstext',
					'horizontal-label' => true,
					'name' => 'nsindex',
					'id' => 'namespace',
					'cssclass' => 'namespaceselector',
					'buttontype' => 'submit',
					'buttonname' => 'addns',
					'buttondefault' => $this->msg( 'export-addns' )->text(),
					'hide-if' => [ '===', 'exportall', '1' ],
				],
			];
		}

		if ( $config->get( 'ExportAllowAll' ) ) {
			$formDescriptor += [
				'exportall' => [
					'type' => 'check',
					'label-message' => 'exportall',
					'name' => 'exportall',
					'id' => 'exportall',
					'default' => $posted ? $request->getCheck( 'exportall' ) : false,
				],
			];
		}

		$formDescriptor += [
			'textarea' => [
				'class' => HTMLTextAreaField::class,
				'name' => 'pages',
				'label-message' => 'export-manual',
				'nodata' => true,
				'rows' => 10,
				'default' => $page,
				'hide-if' => [ '===', 'exportall', '1' ],
			],
		];

		if ( $config->get( 'ExportAllowHistory' ) ) {
			$formDescriptor += [
				'curonly' => [
					'type' => 'check',
					'label-message' => 'exportcuronly',
					'name' => 'curonly',
					'id' => 'curonly',
					'default' => $posted ? $request->getCheck( 'curonly' ) : true,
				],
			];
		} else {
			$out->addWikiMsg( 'exportnohistory' );
		}

		$formDescriptor += [
			'templates' => [
				'type' => 'check',
				'label-message' => 'export-templates',
				'name' => 'templates',
				'id' => 'wpExportTemplates',
				'default' => $posted ? $request->getCheck( 'templates' ) : false,
			],
		];

		if ( $config->get( 'ExportMaxLinkDepth' ) || $this->userCanOverrideExportDepth() ) {
			$formDescriptor += [
				'pagelink-depth' => [
					'type' => 'text',
					'name' => 'pagelink-depth',
					'id' => 'pagelink-depth',
					'label-message' => 'export-pagelinks',
					'default' => '0',
					'size' => 20,
				],
			];
		}

		$formDescriptor += [
			'wpDownload' => [
				'type' => 'check',
				'name' => 'wpDownload',
				'id' => 'wpDownload',
				'default' => $posted ? $request->getCheck( 'wpDownload' ) : true,
				'label-message' => 'export-download',
			],
		];

		if ( $config->get( 'ExportAllowListContributors' ) ) {
			$formDescriptor += [
				'listauthors' => [
					'type' => 'check',
					'label-message' => 'exportlistauthors',
					'default' => $posted ? $request->getCheck( 'listauthors' ) : false,
					'name' => 'listauthors',
					'id' => 'listauthors',
				],
			];
		}

		$htmlForm = HTMLForm::factory( 'ooui', $formDescriptor, $this->getContext() );
		$htmlForm->setSubmitTextMsg( 'export-submit' );
		$htmlForm->prepareForm()->displayForm( false );
		$this->addHelpLink( 'Help:Export' );
	}

	/**
	 * Whether the current user holds the 'override-export-depth' right.
	 *
	 * @return bool
	 */
	private function userCanOverrideExportDepth() {
		return $this->getUser()->isAllowed( 'override-export-depth' );
	}

	/**
	 * Do the actual page exporting
	 *
	 * @param string $page User input on what page(s) to export, one title per line
	 * @param int|array $history One of the WikiExporter history export constants,
	 *  or an array with the keys 'dir', 'offset' and 'limit' as built by execute()
	 * @param bool $list_authors Whether to add distinct author list (when
	 *  not returning full history)
	 * @param bool $exportall Whether to export everything
	 */
	private function doExport( $page, $history, $list_authors, $exportall ) {
		$pages = [];

		// If we are grabbing everything, enable full history and ignore the rest
		if ( $exportall ) {
			$history = WikiExporter::FULL;
		} else {
			$pageSet = []; // Inverted index of all pages to look up

			// Split up and normalize input
			foreach ( explode( "\n", $page ) as $pageName ) {
				$title = Title::newFromText( trim( $pageName ) );
				if ( $title && !$title->isExternal() && $title->getText() !== '' ) {
					// Only record each page once!
					$pageSet[$title->getPrefixedText()] = true;
				}
			}

			// Set of original pages to pass on to further manipulation...
			$inputPages = array_keys( $pageSet );

			// Look up any linked pages if asked...
			if ( $this->templates ) {
				$pageSet = $this->getTemplates( $inputPages, $pageSet );
			}
			$linkDepth = $this->pageLinkDepth;
			if ( $linkDepth ) {
				$pageSet = $this->getPageLinks( $inputPages, $pageSet, $linkDepth );
			}

			$pages = array_keys( $pageSet );

			// Normalize titles to the same format and remove dupes, see T19374
			foreach ( $pages as $k => $v ) {
				$pages[$k] = str_replace( " ", "_", $v );
			}

			$pages = array_unique( $pages );
		}

		/* Ok, let's get to it... */
		if ( $history == WikiExporter::CURRENT ) {
			$lb = false;
			$db = wfGetDB( DB_REPLICA );
			$buffer = WikiExporter::BUFFER;
		} else {
			// Use an unbuffered query; histories may be very long!
			$lb = MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->newMainLB();
			$db = $lb->getConnection( DB_REPLICA );
			$buffer = WikiExporter::STREAM;

			// This might take a while... :D
			Wikimedia\suppressWarnings();
			set_time_limit( 0 );
			Wikimedia\restoreWarnings();
		}

		$exporter = new WikiExporter( $db, $history, $buffer );
		$exporter->list_authors = $list_authors;
		$exporter->openStream();

		if ( $exportall ) {
			$exporter->allPages();
		} else {
			foreach ( $pages as $pageName ) {
				# T10824: Only export pages the user can read
				$title = Title::newFromText( $pageName );
				if ( $title === null ) {
					// @todo Perhaps output an <error> tag or something.
					continue;
				}

				if ( !$title->userCan( 'read', $this->getUser() ) ) {
					// @todo Perhaps output an <error> tag or something.
					continue;
				}

				$exporter->pageByTitle( $title );
			}
		}

		$exporter->closeStream();

		// Only the dedicated load balancer created for streaming is closed here.
		if ( $lb ) {
			$lb->closeAll();
		}
	}

	/**
	 * List the pages that are members of a category, up to ExportPagelistLimit.
	 *
	 * @param Title $title Title whose DB key is used as the category name
	 * @return string[] Page names, prefixed with the namespace name where
	 *  the page is outside the main namespace
	 */
	private function getPagesFromCategory( $title ) {
		$maxPages = $this->getConfig()->get( 'ExportPagelistLimit' );

		$name = $title->getDBkey();

		$dbr = wfGetDB( DB_REPLICA );
		$res = $dbr->select(
			[ 'page', 'categorylinks' ],
			[ 'page_namespace', 'page_title' ],
			[ 'cl_from=page_id', 'cl_to' => $name ],
			__METHOD__,
			[ 'LIMIT' => $maxPages ]
		);

		return $this->getPageNamesFromRows( $res );
	}

	/**
	 * List the pages of a namespace, up to ExportPagelistLimit.
	 *
	 * @param int $nsindex Namespace index
	 * @return string[] Page names, prefixed with the namespace name where
	 *  the page is outside the main namespace
	 */
	private function getPagesFromNamespace( $nsindex ) {
		$maxPages = $this->getConfig()->get( 'ExportPagelistLimit' );

		$dbr = wfGetDB( DB_REPLICA );
		$res = $dbr->select(
			'page',
			[ 'page_namespace', 'page_title' ],
			[ 'page_namespace' => $nsindex ],
			__METHOD__,
			[ 'LIMIT' => $maxPages ]
		);

		return $this->getPageNamesFromRows( $res );
	}

	/**
	 * Turn rows carrying page_namespace and page_title into page names.
	 *
	 * @param Traversable|array $res Rows with page_namespace and page_title fields
	 * @return string[] The page_title of each row, prefixed with
	 *  "<namespace name>:" when page_namespace is non-zero
	 */
	private function getPageNamesFromRows( $res ) {
		global $wgContLang;

		$pages = [];

		foreach ( $res as $row ) {
			$name = $row->page_title;

			if ( $row->page_namespace ) {
				$name = $wgContLang->getNsText( $row->page_namespace ) . ':' . $name;
			}

			$pages[] = $name;
		}

		return $pages;
	}

	/**
	 * Expand a list of pages to include templates used in those pages.
	 * @param array $inputPages List of titles to look up
	 * @param array $pageSet Associative array indexed by titles for output
	 * @return array Associative array index by titles
	 */
	private function getTemplates( $inputPages, $pageSet ) {
		return $this->getLinks( $inputPages, $pageSet,
			'templatelinks',
			[ 'namespace' => 'tl_namespace', 'title' => 'tl_title' ],
			[ 'page_id=tl_from' ]
		);
	}

	/**
	 * Validate link depth setting, if available.
	 *
	 * Negative values become 0. Users without the 'override-export-depth'
	 * right are capped at ExportMaxLinkDepth. The result never exceeds 5.
	 *
	 * @param int|null $depth Requested depth; null (parameter not given)
	 *  results in 0
	 * @return int
	 */
	private function validateLinkDepth( $depth ) {
		if ( $depth < 0 ) {
			return 0;
		}

		if ( !$this->userCanOverrideExportDepth() ) {
			$maxLinkDepth = $this->getConfig()->get( 'ExportMaxLinkDepth' );
			if ( $depth > $maxLinkDepth ) {
				return $maxLinkDepth;
			}
		}

		/*
		 * There's a HARD CODED limit of 5 levels of recursion here to prevent a
		 * crazy-big export from being done by someone setting the depth
		 * number too high. In other words, last resort safety net.
		 */

		return intval( min( $depth, 5 ) );
	}

	/**
	 * Expand a list of pages to include pages linked to from that page.
	 *
	 * Each level looks up the links of the pages found so far. Pages whose
	 * links were already looked up at an earlier level are not queried
	 * again, since a repeated lookup cannot add anything to the set.
	 *
	 * @param array $inputPages Titles to start from
	 * @param array $pageSet Associative array indexed by titles
	 * @param int $depth Number of link levels to follow
	 * @return array Associative array indexed by titles
	 */
	private function getPageLinks( $inputPages, $pageSet, $depth ) {
		$expanded = [];

		for ( ; $depth > 0; --$depth ) {
			$pending = array_diff( $inputPages, $expanded );
			if ( !$pending ) {
				// Nothing new was discovered; further levels cannot add pages.
				break;
			}

			$pageSet = $this->getLinks(
				$pending, $pageSet, 'pagelinks',
				[ 'namespace' => 'pl_namespace', 'title' => 'pl_title' ],
				[ 'page_id=pl_from' ]
			);
			$expanded = array_merge( $expanded, $pending );
			$inputPages = array_keys( $pageSet );
		}

		return $pageSet;
	}

	/**
	 * Expand a list of pages to include items used in those pages.
	 *
	 * Every input page with a valid title is itself added to the set, along
	 * with each (namespace, title) pair the link table records for it.
	 *
	 * @param array $inputPages Array of page titles
	 * @param array $pageSet Associative array indexed by titles
	 * @param string $table Link table to join against the page table
	 * @param array $fields Array of field names; must alias the link target
	 *  columns as 'namespace' and 'title'
	 * @param array $join Join condition(s) between the page and link table
	 * @return array Associative array indexed by titles
	 */
	private function getLinks( $inputPages, $pageSet, $table, $fields, $join ) {
		$dbr = wfGetDB( DB_REPLICA );

		foreach ( $inputPages as $page ) {
			$title = Title::newFromText( $page );

			if ( !$title ) {
				continue;
			}

			$pageSet[$title->getPrefixedText()] = true;
			/// @todo FIXME: May or may not be more efficient to batch these
			/// by namespace when given multiple input pages.
			$result = $dbr->select(
				[ 'page', $table ],
				$fields,
				array_merge(
					$join,
					[
						'page_namespace' => $title->getNamespace(),
						'page_title' => $title->getDBkey()
					]
				),
				__METHOD__
			);

			foreach ( $result as $row ) {
				$linked = Title::makeTitle( $row->namespace, $row->title );
				$pageSet[$linked->getPrefixedText()] = true;
			}
		}

		return $pageSet;
	}

	/**
	 * @return string Name of the special page group this page is listed under
	 */
	protected function getGroupName() {
		return 'pagetools';
	}
}