diff options
author | Yaco <franco@reevo.org> | 2020-06-04 11:01:00 -0300 |
---|---|---|
committer | Yaco <franco@reevo.org> | 2020-06-04 11:01:00 -0300 |
commit | fc7369835258467bf97eb64f184b93691f9a9fd5 (patch) | |
tree | daabd60089d2dd76d9f5fb416b005fbe159c799d /www/wiki/includes/export |
first commit
Diffstat (limited to 'www/wiki/includes/export')
-rw-r--r-- | www/wiki/includes/export/BaseDump.php | 219 | ||||
-rw-r--r-- | www/wiki/includes/export/Dump7ZipOutput.php | 76 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpBZip2Output.php | 36 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpDBZip2Output.php | 36 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpFileOutput.php | 115 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpFilter.php | 134 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpGZipOutput.php | 36 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpLatestFilter.php | 72 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpMultiWriter.php | 113 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpNamespaceFilter.php | 91 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpNotalkFilter.php | 37 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpOutput.php | 114 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpPipeOutput.php | 102 | ||||
-rw-r--r-- | www/wiki/includes/export/DumpStringOutput.php | 45 | ||||
-rw-r--r-- | www/wiki/includes/export/ExportProgressFilter.php | 47 | ||||
-rw-r--r-- | www/wiki/includes/export/WikiExporter.php | 511 | ||||
-rw-r--r-- | www/wiki/includes/export/XmlDumpWriter.php | 449 |
17 files changed, 2233 insertions, 0 deletions
diff --git a/www/wiki/includes/export/BaseDump.php b/www/wiki/includes/export/BaseDump.php new file mode 100644 index 00000000..6a2d3bf6 --- /dev/null +++ b/www/wiki/includes/export/BaseDump.php @@ -0,0 +1,219 @@ +<?php +/** + * Helper class for the --prefetch option of dumpTextPass.php + * + * Copyright © 2005 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup Maintenance + */ + +/** + * Readahead helper for making large MediaWiki data dumps; + * reads in a previous XML dump to sequentially prefetch text + * records already normalized and decompressed. + * + * This can save load on the external database servers, hopefully. 
+ * + * Assumes that dumps will be recorded in the canonical order: + * - ascending by page_id + * - ascending by rev_id within each page + * - text contents are immutable and should not change once + * recorded, so the previous dump is a reliable source + * + * @ingroup Maintenance + */ +class BaseDump { + /** @var XMLReader */ + protected $reader = null; + /** @var bool Set by close(); skipTo() and nodeContents() stop reading once true */ + protected $atEnd = false; + /** @var bool Set by nextRev() when the current page has no further revision */ + protected $atPageEnd = false; + /** @var int Page ID most recently read by nextPage() */ + protected $lastPage = 0; + /** @var int Revision ID most recently read by nextRev(); reset to 0 for each new page */ + protected $lastRev = 0; + /** @var string[]|null Dump files that have not been opened yet */ + protected $infiles = null; + + /** + * @param string $infile Path of the previous dump; several paths may be given separated by ';' + */ + public function __construct( $infile ) { + $this->infiles = explode( ';', $infile ); + $this->reader = new XMLReader(); + $infile = array_shift( $this->infiles ); + $this->openReader( $infile ); + } + + /** + * Attempts to fetch the text of a particular page revision + * from the dump stream. May return null if the page is + * unavailable. + * + * @param int $page ID number of page to read + * @param int $rev ID number of revision to read + * @return string|null + */ + function prefetch( $page, $rev ) { + $page = intval( $page ); + $rev = intval( $rev ); + while ( $this->lastPage < $page && !$this->atEnd ) { + $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" ); + $this->nextPage(); + } + if ( $this->lastPage > $page || $this->atEnd ) { + $this->debug( "BaseDump::prefetch already past page $page " + . "looking for rev $rev [$this->lastPage, $this->lastRev]" ); + + return null; + } + while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) { + $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, " + . 
"looking for $page, $rev" ); + $this->nextRev(); + } + if ( $this->lastRev == $rev && !$this->atEnd ) { + $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" ); + + return $this->nextText(); + } else { + $this->debug( "BaseDump::prefetch already past rev $rev on page $page " + . "[$this->lastPage, $this->lastRev]" ); + + return null; + } + } + + /** + * Passes a message, with a newline appended, to wfDebug(). + * + * @param string $str + */ + function debug( $str ) { + wfDebug( $str . "\n" ); + // global $dumper; + // $dumper->progress( $str ); + } + + /** + * Advances to the next page element and records its ID. When the + * current file has no further page, the reader is closed and the next + * queued file (if any) is opened with the same options as the first one. + * Reading only resumes if that file could actually be opened; otherwise + * the dump stays marked as finished. + * + * @access private + */ + function nextPage() { + if ( $this->skipTo( 'page', 'mediawiki' ) ) { + if ( $this->skipTo( 'id' ) ) { + $this->lastPage = intval( $this->nodeContents() ); + $this->lastRev = 0; + $this->atPageEnd = false; + } + } else { + $this->close(); + if ( count( $this->infiles ) ) { + $infile = array_shift( $this->infiles ); + if ( $this->openReader( $infile ) ) { + $this->atEnd = false; + } else { + $this->debug( "BaseDump::nextPage unable to open $infile" ); + } + } + } + } + + /** + * Advances to the next revision element of the current page and records + * its ID, or flags the end of the page when none is found. + * + * @access private + */ + function nextRev() { + if ( $this->skipTo( 'revision' ) ) { + if ( $this->skipTo( 'id' ) ) { + $this->lastRev = intval( $this->nodeContents() ); + } + } else { + $this->atPageEnd = true; + } + } + + /** + * Skips to the next text element and returns its contents as a string. + * + * @access private + * @return string + */ + function nextText() { + $this->skipTo( 'text' ); + + return strval( $this->nodeContents() ); + } + + /** + * Reads forward until an element named $name starts. + * + * @access private + * @param string $name + * @param string $parent + * @return bool|null True when the element was found; false when the reader + * is already at the end or the closing tag of $parent comes first; null + * (from close()) when the input runs out + */ + function skipTo( $name, $parent = 'page' ) { + if ( $this->atEnd ) { + return false; + } + while ( $this->reader->read() ) { + if ( $this->reader->nodeType == XMLReader::ELEMENT + && $this->reader->name == $name + ) { + return true; + } + if ( $this->reader->nodeType == XMLReader::END_ELEMENT + && $this->reader->name == $parent + ) { + $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" ); + + return false; + } + } + + return $this->close(); + } + + /** + * Shouldn't something like this be built-in to XMLReader? 
+ * Fetches text contents of the current element, assuming + * no sub-elements or such scary things. + * + * @return string|null Null when the reader is at the end or the input + * runs out before the element is closed + * @access private + */ + function nodeContents() { + if ( $this->atEnd ) { + return null; + } + if ( $this->reader->isEmptyElement ) { + return ""; + } + $buffer = ""; + while ( $this->reader->read() ) { + switch ( $this->reader->nodeType ) { + case XMLReader::TEXT: + // case XMLReader::WHITESPACE: + case XMLReader::SIGNIFICANT_WHITESPACE: + $buffer .= $this->reader->value; + break; + case XMLReader::END_ELEMENT: + return $buffer; + } + } + + return $this->close(); + } + + /** + * Closes the reader and marks the dump as finished. + * + * @access private + * @return null + */ + function close() { + $this->reader->close(); + $this->atEnd = true; + + return null; + } + + /** + * Opens a dump file on the reader, passing LIBXML_PARSEHUGE when that + * constant is defined. + * + * @param string $infile + * @return bool Whether the file could be opened + */ + private function openReader( $infile ) { + if ( defined( 'LIBXML_PARSEHUGE' ) ) { + return $this->reader->open( $infile, null, LIBXML_PARSEHUGE ); + } + + return $this->reader->open( $infile ); + } +} diff --git a/www/wiki/includes/export/Dump7ZipOutput.php b/www/wiki/includes/export/Dump7ZipOutput.php new file mode 100644 index 00000000..31c945c0 --- /dev/null +++ b/www/wiki/includes/export/Dump7ZipOutput.php @@ -0,0 +1,76 @@ +<?php +/** + * Sends dump output via the p7zip compressor. + * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * Dump output that is sent through the 7za command. + * + * @ingroup Dump + */ +class Dump7ZipOutput extends DumpPipeOutput { + /** + * @var int Compression level given to 7za's -mx option + */ + protected $compressionLevel; + + /** + * Builds the 7za command line for $file and passes it to the parent constructor. + * + * @param string $file + * @param int $cmpLevel Compression level passed to 7za command's -mx + */ + function __construct( $file, $cmpLevel = 4 ) { + $this->compressionLevel = $cmpLevel; + $command = $this->setup7zCommand( $file ); + parent::__construct( $command ); + $this->filename = $file; + } + + /** + * Assembles the 7za shell command for $file. The compression level and + * the file name are shell-escaped, and the command's stdout and stderr + * are redirected to whatever wfGetNull() returns. + * + * @param string $file + * @return string + */ + function setup7zCommand( $file ) { + $command = "7za a -bd -si -mx="; + $command .= wfEscapeShellArg( $this->compressionLevel ) . ' '; + $command .= wfEscapeShellArg( $file ); + // Suppress annoying useless crap from p7zip + // Unfortunately this could suppress real error messages too + $command .= ' >' . wfGetNull() . ' 2>&1'; + return $command; + } + + /** + * Closes the write handle and the process resource, renames the output + * file and, when $open is true, starts a new 7za command for the original + * file name. Nothing happens if checkRenameArgCount() yields an empty name. + * + * NOTE(review): unlike DumpFileOutput::closeAndRename(), the handle is + * closed without first checking that it is open - confirm this is intended. + * + * @param string $newname + * @param bool $open + */ + function closeAndRename( $newname, $open = false ) { + $newname = $this->checkRenameArgCount( $newname ); + if ( $newname ) { + fclose( $this->handle ); + proc_close( $this->procOpenResource ); + $this->renameOrException( $newname ); + if ( $open ) { + $command = $this->setup7zCommand( $this->filename ); + $this->startCommand( $command ); + } + } + } +} diff --git a/www/wiki/includes/export/DumpBZip2Output.php b/www/wiki/includes/export/DumpBZip2Output.php new file mode 100644 index 00000000..bbc1c11f --- /dev/null +++ b/www/wiki/includes/export/DumpBZip2Output.php @@ -0,0 +1,36 @@ +<?php +/** + * Sends dump output via the bzip2 compressor. + * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * Dump output that hands the "bzip2" command and the target file to + * DumpPipeOutput. + * + * @ingroup Dump + */ +class DumpBZip2Output extends DumpPipeOutput { + /** + * @param string $file Passed to the parent constructor after the command name + */ + function __construct( $file ) { + parent::__construct( "bzip2", $file ); + } +} diff --git a/www/wiki/includes/export/DumpDBZip2Output.php b/www/wiki/includes/export/DumpDBZip2Output.php new file mode 100644 index 00000000..5edde8f7 --- /dev/null +++ b/www/wiki/includes/export/DumpDBZip2Output.php @@ -0,0 +1,36 @@ +<?php +/** + * Sends dump output via the dbzip2 compressor. + * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * Dump output that hands the "dbzip2" command and the target file to + * DumpPipeOutput. + * + * @ingroup Dump + */ +class DumpDBZip2Output extends DumpPipeOutput { + /** + * @param string $file Passed to the parent constructor after the command name + */ + function __construct( $file ) { + parent::__construct( "dbzip2", $file ); + } +} diff --git a/www/wiki/includes/export/DumpFileOutput.php b/www/wiki/includes/export/DumpFileOutput.php new file mode 100644 index 00000000..4bec7d45 --- /dev/null +++ b/www/wiki/includes/export/DumpFileOutput.php @@ -0,0 +1,115 @@ +<?php +/** + * Stream outputter to send data to a file. + * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * Dump output that writes to a file opened with fopen(). + * + * @ingroup Dump + */ +class DumpFileOutput extends DumpOutput { + /** + * $handle is the value returned by fopen(), or false once closed; + * $filename is the path given to the constructor. + */ + protected $handle = false, $filename; + + /** + * Opens $file for writing (mode "wt") and remembers its name. + * + * @param string $file + */ + function __construct( $file ) { + $this->handle = fopen( $file, "wt" ); + $this->filename = $file; + } + + /** + * Lets the parent handle the closing text, then closes the file handle + * if it is still open. + * + * @param string $string + */ + function writeCloseStream( $string ) { + parent::writeCloseStream( $string ); + if ( $this->handle ) { + fclose( $this->handle ); + $this->handle = false; + } + } + + /** + * Writes $string to the file handle. + * + * @param string $string + */ + function write( $string ) { + fputs( $this->handle, $string ); + } + + /** + * Shortcut for closeAndRename( $newname, true ). + * + * @param string $newname + */ + function closeRenameAndReopen( $newname ) { + $this->closeAndRename( $newname, true ); + } + + /** + * Renames the current file to $newname. + * + * @param string $newname + * @throws MWException If rename() reports failure + */ + function renameOrException( $newname ) { + if ( !rename( $this->filename, $newname ) ) { + throw new MWException( __METHOD__ . ": rename of file {$this->filename} to $newname failed\n" ); + } + } + + /** + * Normalizes a rename target: an array is unwrapped to its first element, + * any other value is returned unchanged. + * + * @param string|array $newname + * @return string + * @throws MWException If an array with more than one element is passed + */ + function checkRenameArgCount( $newname ) { + if ( is_array( $newname ) ) { + if ( count( $newname ) > 1 ) { + throw new MWException( __METHOD__ . 
": passed multiple arguments for rename of single file\n" ); + } else { + $newname = $newname[0]; + } + } + return $newname; + } + + /** + * Closes the handle if it is open, renames the file and, when $open is + * true, reopens the original file name for writing. Does nothing when the + * normalized new name is empty. + * + * @param string $newname + * @param bool $open + */ + function closeAndRename( $newname, $open = false ) { + $newname = $this->checkRenameArgCount( $newname ); + if ( $newname ) { + if ( $this->handle ) { + fclose( $this->handle ); + $this->handle = false; + } + $this->renameOrException( $newname ); + if ( $open ) { + $this->handle = fopen( $this->filename, "wt" ); + } + } + } + + /** + * Returns the name of the file this output writes to. + * + * @return string|null + */ + function getFilenames() { + return $this->filename; + } +} diff --git a/www/wiki/includes/export/DumpFilter.php b/www/wiki/includes/export/DumpFilter.php new file mode 100644 index 00000000..1349c54b --- /dev/null +++ b/www/wiki/includes/export/DumpFilter.php @@ -0,0 +1,134 @@ +<?php +/** + * Dump output filter class. + * This just does output filtering and streaming; XML formatting is done + * higher up, so be careful in what you do. + * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * Base dump filter. Stream-level calls are forwarded to the wrapped sink + * unconditionally; page and revision output is forwarded only while pass() + * accepted the page that is currently open. + * + * @ingroup Dump + */ +class DumpFilter { + /** + * @var DumpOutput + * FIXME will need to be made protected whenever legacy code + * is updated. + */ + public $sink; + + /** + * @var bool Result of pass() for the page currently being written + */ + protected $sendingThisPage; + + /** + * @param DumpOutput &$sink Sink that receives the filtered output + */ + function __construct( &$sink ) { + $this->sink =& $sink; + } + + /** + * Forwards the stream opening to the sink. + * + * @param string $string + */ + function writeOpenStream( $string ) { + $this->sink->writeOpenStream( $string ); + } + + /** + * Forwards the stream closing to the sink. + * + * @param string $string + */ + function writeCloseStream( $string ) { + $this->sink->writeCloseStream( $string ); + } + + /** + * Asks pass() whether this page should be sent and, if so, forwards the + * page opening to the sink. + * + * @param object $page + * @param string $string + */ + function writeOpenPage( $page, $string ) { + $this->sendingThisPage = $this->pass( $page, $string ); + if ( $this->sendingThisPage ) { + $this->sink->writeOpenPage( $page, $string ); + } + } + + /** + * Forwards the page closing if the page was being sent, then clears the flag. + * + * @param string $string + */ + function writeClosePage( $string ) { + if ( $this->sendingThisPage ) { + $this->sink->writeClosePage( $string ); + $this->sendingThisPage = false; + } + } + + /** + * Forwards the revision if the current page is being sent. + * + * @param object $rev + * @param string $string + */ + function writeRevision( $rev, $string ) { + if ( $this->sendingThisPage ) { + $this->sink->writeRevision( $rev, $string ); + } + } + + /** + * Forwards a log item to the sink regardless of the page flag. + * + * NOTE(review): this calls the sink's writeRevision(), not a + * writeLogItem() method - confirm this is intended. + * + * @param object $rev + * @param string $string + */ + function writeLogItem( $rev, $string ) { + $this->sink->writeRevision( $rev, $string ); + } + + /** + * Forwards to the sink's closeRenameAndReopen(). + * + * @param string $newname + */ + function closeRenameAndReopen( $newname ) { + $this->sink->closeRenameAndReopen( $newname ); + } + + /** + * Forwards to the sink's closeAndRename(). + * + * @param string $newname + * @param bool $open + */ + function closeAndRename( $newname, $open = false ) { + $this->sink->closeAndRename( $newname, $open ); + } + + /** + * Returns whatever the sink's getFilenames() reports. + * + * @return array|string|null + */ + function getFilenames() { + return $this->sink->getFilenames(); + } + + /** + * Override for page-based filter types. + * The base implementation accepts every page. writeOpenPage() also passes + * the page string as a second argument, which is ignored here. 
+ * @param object $page + * @return bool + */ + function pass( $page ) { + return true; + } +} diff --git a/www/wiki/includes/export/DumpGZipOutput.php b/www/wiki/includes/export/DumpGZipOutput.php new file mode 100644 index 00000000..d9e74a79 --- /dev/null +++ b/www/wiki/includes/export/DumpGZipOutput.php @@ -0,0 +1,36 @@ +<?php +/** + * Sends dump output via the gzip compressor. + * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * Dump output that hands the "gzip" command and the target file to + * DumpPipeOutput. + * + * @ingroup Dump + */ +class DumpGZipOutput extends DumpPipeOutput { + /** + * @param string $file Passed to the parent constructor after the command name + */ + function __construct( $file ) { + parent::__construct( "gzip", $file ); + } +} diff --git a/www/wiki/includes/export/DumpLatestFilter.php b/www/wiki/includes/export/DumpLatestFilter.php new file mode 100644 index 00000000..d3742b73 --- /dev/null +++ b/www/wiki/includes/export/DumpLatestFilter.php @@ -0,0 +1,72 @@ +<?php +/** + * Dump output filter to include only the last revision in each page sequence. 
+ * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * Filter that holds back each page until it is closed and then emits it + * with only the revision whose rev_id equals the page's page_latest. + * + * @ingroup Dump + */ +class DumpLatestFilter extends DumpFilter { + /** @var object|null Page row remembered by writeOpenPage() */ + public $page; + + /** @var string|null Page opening text remembered by writeOpenPage() */ + public $pageString; + + /** @var object|null Latest revision seen for the current page */ + public $rev; + + /** @var string|null Text that belongs to $rev */ + public $revString; + + /** + * Remembers the page instead of forwarding it; output is deferred until + * writeClosePage(). + * + * @param object $page + * @param string $string + */ + function writeOpenPage( $page, $string ) { + $this->page = $page; + $this->pageString = $string; + } + + /** + * Sends the remembered page with its latest revision to the sink if such + * a revision was seen, then clears all remembered state. + * + * @param string $string + */ + function writeClosePage( $string ) { + if ( $this->rev ) { + $this->sink->writeOpenPage( $this->page, $this->pageString ); + $this->sink->writeRevision( $this->rev, $this->revString ); + $this->sink->writeClosePage( $string ); + } + $this->rev = null; + $this->revString = null; + $this->page = null; + $this->pageString = null; + } + + /** + * Keeps the revision only when its rev_id matches the page_latest of the + * remembered page; other revisions are dropped. + * + * @param object $rev + * @param string $string + */ + function writeRevision( $rev, $string ) { + if ( $rev->rev_id == $this->page->page_latest ) { + $this->rev = $rev; + $this->revString = $string; + } + } +} diff --git a/www/wiki/includes/export/DumpMultiWriter.php b/www/wiki/includes/export/DumpMultiWriter.php new file mode 100644 index 00000000..92118fe4 --- /dev/null +++ 
b/www/wiki/includes/export/DumpMultiWriter.php @@ -0,0 +1,113 @@ +<?php +/** + * Dump writer that forwards every call to a list of output sinks. + * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * Forwards each write to every sink, in order. Sinks are addressed by the + * integer indexes 0 .. count-1. + * + * @ingroup Dump + */ +class DumpMultiWriter { + /** + * Declared explicitly (previously created on the fly in the constructor); + * kept public so existing outside access keeps working. + * + * @var array Sinks that receive every call + */ + public $sinks; + + /** + * @var int Number of sinks, as counted by the constructor + */ + public $count; + + /** + * @param array $sinks + */ + function __construct( $sinks ) { + $this->sinks = $sinks; + $this->count = count( $sinks ); + } + + /** + * Forwards the stream opening to every sink. + * + * @param string $string + */ + function writeOpenStream( $string ) { + for ( $i = 0; $i < $this->count; $i++ ) { + $this->sinks[$i]->writeOpenStream( $string ); + } + } + + /** + * Forwards the stream closing to every sink. + * + * @param string $string + */ + function writeCloseStream( $string ) { + for ( $i = 0; $i < $this->count; $i++ ) { + $this->sinks[$i]->writeCloseStream( $string ); + } + } + + /** + * Forwards the page opening to every sink. + * + * @param object $page + * @param string $string + */ + function writeOpenPage( $page, $string ) { + for ( $i = 0; $i < $this->count; $i++ ) { + $this->sinks[$i]->writeOpenPage( $page, $string ); + } + } + + /** + * Forwards the page closing to every sink. + * + * @param string $string + */ + function writeClosePage( $string ) { + for ( $i = 0; $i < $this->count; $i++ ) { + $this->sinks[$i]->writeClosePage( $string ); + } + } + + /** + * Forwards the revision to every sink. + * + * 
@param object $rev + * @param string $string + */ + function writeRevision( $rev, $string ) { + for ( $i = 0; $i < $this->count; $i++ ) { + $this->sinks[$i]->writeRevision( $rev, $string ); + } + } + + /** + * Shortcut for closeAndRename( $newnames, true ). + * + * @param array $newnames + */ + function closeRenameAndReopen( $newnames ) { + $this->closeAndRename( $newnames, true ); + } + + /** + * Calls closeAndRename() on every sink, giving sink number i the entry + * $newnames[i]. + * + * @param array $newnames New names, indexed like the sinks + * @param bool $open + */ + function closeAndRename( $newnames, $open = false ) { + for ( $i = 0; $i < $this->count; $i++ ) { + $this->sinks[$i]->closeAndRename( $newnames[$i], $open ); + } + } + + /** + * Collects the getFilenames() result of every sink, in sink order. + * + * @return array + */ + function getFilenames() { + $filenames = []; + for ( $i = 0; $i < $this->count; $i++ ) { + $filenames[] = $this->sinks[$i]->getFilenames(); + } + return $filenames; + } +} diff --git a/www/wiki/includes/export/DumpNamespaceFilter.php b/www/wiki/includes/export/DumpNamespaceFilter.php new file mode 100644 index 00000000..12b9b55e --- /dev/null +++ b/www/wiki/includes/export/DumpNamespaceFilter.php @@ -0,0 +1,91 @@ +<?php +/** + * Dump output filter to include or exclude pages in a given set of namespaces. + * + * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com> + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
/**
 * Dump output filter to include or exclude pages in a given set of namespaces.
 *
 * @ingroup Dump
 */
class DumpNamespaceFilter extends DumpFilter {
	/** @var bool True when the listed namespaces are to be excluded rather than included */
	public $invert = false;

	/** @var array Selected namespace ids, stored as keys (value is always true) */
	public $namespaces = [];

	/**
	 * @param DumpOutput &$sink
	 * @param string $param Comma-separated list of namespace constant names
	 *  (e.g. "NS_MAIN") and/or numeric ids; a leading "!" inverts the filter
	 * @throws MWException When an entry is neither a known name nor numeric
	 */
	function __construct( &$sink, $param ) {
		parent::__construct( $sink );

		$constants = [
			"NS_MAIN"           => NS_MAIN,
			"NS_TALK"           => NS_TALK,
			"NS_USER"           => NS_USER,
			"NS_USER_TALK"      => NS_USER_TALK,
			"NS_PROJECT"        => NS_PROJECT,
			"NS_PROJECT_TALK"   => NS_PROJECT_TALK,
			"NS_FILE"           => NS_FILE,
			"NS_FILE_TALK"      => NS_FILE_TALK,
			"NS_IMAGE"          => NS_FILE, // NS_IMAGE is an alias for NS_FILE
			"NS_IMAGE_TALK"     => NS_FILE_TALK,
			"NS_MEDIAWIKI"      => NS_MEDIAWIKI,
			"NS_MEDIAWIKI_TALK" => NS_MEDIAWIKI_TALK,
			"NS_TEMPLATE"       => NS_TEMPLATE,
			"NS_TEMPLATE_TALK"  => NS_TEMPLATE_TALK,
			"NS_HELP"           => NS_HELP,
			"NS_HELP_TALK"      => NS_HELP_TALK,
			"NS_CATEGORY"       => NS_CATEGORY,
			"NS_CATEGORY_TALK"  => NS_CATEGORY_TALK ];

		// substr() instead of the curly-brace offset ($param{0}): that syntax
		// no longer parses on PHP 8, and an offset read on an empty string
		// raises a notice. An empty $param still ends in the exception below.
		if ( substr( $param, 0, 1 ) === '!' ) {
			$this->invert = true;
			$param = substr( $param, 1 );
		}

		foreach ( explode( ',', $param ) as $key ) {
			$key = trim( $key );
			if ( isset( $constants[$key] ) ) {
				$ns = $constants[$key];
				$this->namespaces[$ns] = true;
			} elseif ( is_numeric( $key ) ) {
				$ns = intval( $key );
				$this->namespaces[$ns] = true;
			} else {
				throw new MWException( "Unrecognized namespace key '$key'\n" );
			}
		}
	}

	/**
	 * @param object $page Row carrying a page_namespace field
	 * @return bool True when the page's namespace is selected and the filter
	 *  is not inverted, or when it is not selected and the filter is inverted
	 */
	function pass( $page ) {
		$match = isset( $this->namespaces[$page->page_namespace] );
		return $this->invert xor $match;
	}
}
/**
 * Simple dump output filter to exclude all talk pages.
 *
 * @ingroup Dump
 */
class DumpNotalkFilter extends DumpFilter {
	/**
	 * Decide whether a page passes the filter.
	 *
	 * @param object $page Row carrying a page_namespace field
	 * @return bool False when MWNamespace::isTalk() reports the namespace as a
	 *  talk namespace, true otherwise
	 */
	public function pass( $page ) {
		$isTalk = MWNamespace::isTalk( $page->page_namespace );

		return !$isTalk;
	}
}
/**
 * Base output sink for dumps. Every write* event hands its string to write(),
 * which prints it; the rename and filename hooks do nothing here.
 *
 * @ingroup Dump
 */
class DumpOutput {

	/**
	 * @param string $string
	 */
	public function writeOpenStream( $string ) {
		$this->write( $string );
	}

	/**
	 * @param string $string
	 */
	public function writeCloseStream( $string ) {
		$this->write( $string );
	}

	/**
	 * @param object $page Not used by this base implementation
	 * @param string $string
	 */
	public function writeOpenPage( $page, $string ) {
		$this->write( $string );
	}

	/**
	 * @param string $string
	 */
	public function writeClosePage( $string ) {
		$this->write( $string );
	}

	/**
	 * @param object $rev Not used by this base implementation
	 * @param string $string
	 */
	public function writeRevision( $rev, $string ) {
		$this->write( $string );
	}

	/**
	 * @param object $rev Not used by this base implementation
	 * @param string $string
	 */
	public function writeLogItem( $rev, $string ) {
		$this->write( $string );
	}

	/**
	 * Send the string to standard output.
	 * Override to write to a different stream type.
	 *
	 * This base implementation returns nothing.
	 *
	 * @param string $string
	 */
	public function write( $string ) {
		echo $string;
	}

	/**
	 * Close the old file, move it to a specified name,
	 * and reopen new file with the old name. Use this
	 * for writing out a file in multiple pieces
	 * at specified checkpoints (e.g. every n hours).
	 *
	 * No-op in this base class.
	 *
	 * @param string|array $newname File name. May be a string or an array with one element
	 */
	public function closeRenameAndReopen( $newname ) {
	}

	/**
	 * Close the old file, and move it to a specified name.
	 * Use this for the last piece of a file written out
	 * at specified checkpoints (e.g. every n hours).
	 *
	 * No-op in this base class.
	 *
	 * @param string|array $newname File name. May be a string or an array with one element
	 * @param bool $open If true, a new file with the old filename will be opened
	 *   again for writing (default: false)
	 */
	public function closeAndRename( $newname, $open = false ) {
	}

	/**
	 * Returns the name of the file or files which are
	 * being written to, if there are any.
	 *
	 * @return null This base class writes to no file
	 */
	public function getFilenames() {
		return null;
	}
}
/**
 * Stream outputter to send data to a file via some filter program.
 * Even if compression is available in a library, using a separate
 * program can allow us to make use of a multi-processor system.
 *
 * @ingroup Dump
 */
class DumpPipeOutput extends DumpFileOutput {
	/**
	 * @var string Command line as started by the constructor (including the
	 *  "> file" redirect when a file was given)
	 * @var string|null Target file, or null when none was given
	 */
	protected $command, $filename;

	/** @var resource|bool Handle returned by proc_open(), false when no process is open */
	protected $procOpenResource = false;

	/**
	 * @param string $command Shell command that reads the dump from its stdin
	 * @param string|null $file If given, the command's stdout is redirected to this file
	 * @throws MWException When the command cannot be started
	 */
	function __construct( $command, $file = null ) {
		if ( !is_null( $file ) ) {
			$command .= " > " . wfEscapeShellArg( $file );
		}

		$this->startCommand( $command );
		$this->command = $command;
		$this->filename = $file;
	}

	/**
	 * @param string $string
	 */
	function writeCloseStream( $string ) {
		parent::writeCloseStream( $string );
		if ( $this->procOpenResource ) {
			proc_close( $this->procOpenResource );
			$this->procOpenResource = false;
		}
	}

	/**
	 * Start the command and use its stdin pipe as the write handle.
	 *
	 * @param string $command
	 * @throws MWException When proc_open() fails or yields no stdin pipe
	 */
	function startCommand( $command ) {
		$spec = [
			0 => [ "pipe", "r" ],
		];
		$pipes = [];
		$this->procOpenResource = proc_open( $command, $spec, $pipes );
		if ( $this->procOpenResource === false || !isset( $pipes[0] ) ) {
			// Without this check a failed start left $this->handle unset and
			// every later write went nowhere.
			$this->procOpenResource = false;
			$this->handle = false;
			throw new MWException( __METHOD__ . ": unable to start command '$command'\n" );
		}
		$this->handle = $pipes[0];
	}

	/**
	 * @param string $newname
	 */
	function closeRenameAndReopen( $newname ) {
		$this->closeAndRename( $newname, true );
	}

	/**
	 * Close the pipe and the process, rename the output file and optionally
	 * start the command again on the original file name.
	 *
	 * @param string $newname
	 * @param bool $open
	 */
	function closeAndRename( $newname, $open = false ) {
		$newname = $this->checkRenameArgCount( $newname );
		if ( $newname ) {
			// Close our end of the pipe before waiting for the process.
			if ( $this->handle ) {
				fclose( $this->handle );
				$this->handle = false;
			}
			if ( $this->procOpenResource ) {
				proc_close( $this->procOpenResource );
				$this->procOpenResource = false;
			}
			$this->renameOrException( $newname );
			if ( $open ) {
				$command = $this->command;
				$redirect = " > " . wfEscapeShellArg( $this->filename );
				// The constructor stores the command with the redirect already
				// appended; only add it when it is not there yet (e.g. when a
				// subclass stored a bare command), so it is never doubled.
				if ( substr( $command, -strlen( $redirect ) ) !== $redirect ) {
					$command .= $redirect;
				}
				$this->startCommand( $command );
			}
		}
	}
}
/**
 * Stream outputter that buffers and returns data as a string.
 *
 * @ingroup Dump
 * @since 1.28
 */
class DumpStringOutput extends DumpOutput {
	/** @var string Everything written so far, in write order */
	private $output = '';

	/**
	 * Append to the in-memory buffer instead of printing.
	 *
	 * @param string $string
	 */
	public function write( $string ) {
		$this->output = $this->output . $string;
	}

	/**
	 * Get the string containing the output.
	 *
	 * @return string
	 */
	public function __toString() {
		$buffered = $this->output;

		return $buffered;
	}
}
/**
 * Pass-through filter that reports to a progress object: once for each
 * revision written and once for each page closed.
 *
 * @ingroup Dump
 */
class ExportProgressFilter extends DumpFilter {
	/**
	 * @var BackupDumper
	 */
	private $progress;

	/**
	 * @param DumpOutput &$sink Handed on to the parent constructor
	 * @param BackupDumper &$progress Object whose reportPage() and revCount() are called
	 */
	public function __construct( &$sink, &$progress ) {
		parent::__construct( $sink );
		$this->progress = $progress;
	}

	/**
	 * Let the parent handle the event first, then report the page.
	 *
	 * @param string $string
	 */
	public function writeClosePage( $string ) {
		parent::writeClosePage( $string );

		$reporter = $this->progress;
		$reporter->reportPage();
	}

	/**
	 * Let the parent handle the event first, then count the revision.
	 *
	 * @param object $rev
	 * @param string $string
	 */
	public function writeRevision( $rev, $string ) {
		parent::writeRevision( $rev, $string );

		$reporter = $this->progress;
		$reporter->revCount();
	}
}
/**
 * @defgroup Dump Dump
 */

use Wikimedia\Rdbms\ResultWrapper;
use Wikimedia\Rdbms\IDatabase;

/**
 * Base class for exporting: builds the page/revision or log query, walks the
 * result and hands each row, together with the XML produced for it by
 * XmlDumpWriter, to the configured output sink.
 *
 * @ingroup SpecialPage Dump
 */
class WikiExporter {
	/** @var bool Return distinct author list (when not returning full history) */
	public $list_authors = false;

	/** @var bool */
	public $dumpUploads = false;

	/** @var bool */
	public $dumpUploadFileContents = false;

	/** @var string */
	public $author_list = "";

	const FULL = 1;
	const CURRENT = 2;
	const STABLE = 4; // extension defined
	const LOGS = 8;
	const RANGE = 16;

	const BUFFER = 0;
	const STREAM = 1;

	const TEXT = 0;
	const STUB = 1;

	/** @var int */
	public $buffer;

	/** @var int */
	public $text;

	/** @var DumpOutput */
	public $sink;

	// The three properties below used to be created on the fly by the
	// constructor. They are declared public so existing outside access keeps
	// working.

	/** @var IDatabase */
	public $db;

	/** @var int|array History mode, see the constructor */
	public $history;

	/** @var XmlDumpWriter */
	public $writer;

	/**
	 * Returns the export schema version.
	 * @return string
	 */
	public static function schemaVersion() {
		return "0.10";
	}

	/**
	 * If using WikiExporter::STREAM to stream a large amount of data,
	 * provide a database connection which is not managed by
	 * LoadBalancer to read from: some history blob types will
	 * make additional queries to pull source data while the
	 * main query is still running.
	 *
	 * @param IDatabase $db
	 * @param int|array $history One of WikiExporter::FULL, WikiExporter::CURRENT,
	 *   WikiExporter::RANGE or WikiExporter::STABLE, or an associative array:
	 *   - offset: non-inclusive offset at which to start the query
	 *   - limit: maximum number of rows to return
	 *   - dir: "asc" or "desc" timestamp order
	 * @param int $buffer One of WikiExporter::BUFFER or WikiExporter::STREAM
	 * @param int $text One of WikiExporter::TEXT or WikiExporter::STUB
	 */
	function __construct( $db, $history = self::CURRENT,
		$buffer = self::BUFFER, $text = self::TEXT ) {
		$this->db = $db;
		$this->history = $history;
		$this->buffer = $buffer;
		$this->writer = new XmlDumpWriter();
		$this->sink = new DumpOutput();
		$this->text = $text;
	}

	/**
	 * Set the DumpOutput or DumpFilter object which will receive
	 * various row objects and XML output for filtering. Filters
	 * can be chained or used as callbacks.
	 *
	 * @param DumpOutput &$sink
	 */
	public function setOutputSink( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Write the stream opening produced by the writer to the sink.
	 */
	public function openStream() {
		$output = $this->writer->openStream();
		$this->sink->writeOpenStream( $output );
	}

	/**
	 * Write the stream closing produced by the writer to the sink.
	 */
	public function closeStream() {
		$output = $this->writer->closeStream();
		$this->sink->writeCloseStream( $output );
	}

	/**
	 * Dumps a series of page and revision records for all pages
	 * in the database, either including complete history or only
	 * the most recent version.
	 */
	public function allPages() {
		$this->dumpFrom( '' );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database falling within the page_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 * @param bool $orderRevs order revisions within pages in ascending order
	 */
	public function pagesByRange( $start, $end, $orderRevs ) {
		if ( $orderRevs ) {
			$condition = 'rev_page >= ' . intval( $start );
			if ( $end ) {
				$condition .= ' AND rev_page < ' . intval( $end );
			}
		} else {
			$condition = 'page_id >= ' . intval( $start );
			if ( $end ) {
				$condition .= ' AND page_id < ' . intval( $end );
			}
		}
		$this->dumpFrom( $condition, $orderRevs );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database with revisions falling within the rev_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 */
	public function revsByRange( $start, $end ) {
		$condition = 'rev_id >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND rev_id < ' . intval( $end );
		}
		$this->dumpFrom( $condition );
	}

	/**
	 * @param Title $title
	 */
	public function pageByTitle( $title ) {
		$this->dumpFrom(
			'page_namespace=' . $title->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $title->getDBkey() ) );
	}

	/**
	 * @param string $name
	 * @throws MWException
	 */
	public function pageByName( $name ) {
		$title = Title::newFromText( $name );
		if ( is_null( $title ) ) {
			throw new MWException( "Can't export invalid title" );
		} else {
			$this->pageByTitle( $title );
		}
	}

	/**
	 * @param array $names
	 */
	public function pagesByName( $names ) {
		foreach ( $names as $name ) {
			$this->pageByName( $name );
		}
	}

	/**
	 * Dump without any caller-side condition; which kind of dump results
	 * depends on the history mode given to the constructor.
	 */
	public function allLogs() {
		$this->dumpFrom( '' );
	}

	/**
	 * @param int $start Inclusive lower log_id limit
	 * @param int $end Exclusive upper log_id limit; if 0, no upper limit
	 */
	public function logsByRange( $start, $end ) {
		$condition = 'log_id >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND log_id < ' . intval( $end );
		}
		$this->dumpFrom( $condition );
	}

	/**
	 * Generates the distinct list of authors of an article
	 * Not called by default (depends on $this->list_authors)
	 * Can be set by Special:Export when not exporting whole history
	 *
	 * The result is stored in $this->author_list.
	 *
	 * @param array $cond
	 */
	protected function do_list_authors( $cond ) {
		$this->author_list = "<contributors>";
		// rev_deleted

		$revQuery = Revision::getQueryInfo( [ 'page' ] );
		$res = $this->db->select(
			$revQuery['tables'],
			[
				'rev_user_text' => $revQuery['fields']['rev_user_text'],
				'rev_user' => $revQuery['fields']['rev_user'],
			],
			[
				$this->db->bitAnd( 'rev_deleted', Revision::DELETED_USER ) . ' = 0',
				$cond,
			],
			__METHOD__,
			[ 'DISTINCT' ],
			$revQuery['joins']
		);

		foreach ( $res as $row ) {
			// htmlspecialchars(), not htmlentities(): the list becomes part of
			// the XML dump, and HTML-only named entities such as &eacute; are
			// not defined in XML.
			$this->author_list .= "<contributor>" .
				"<username>" .
				htmlspecialchars( $row->rev_user_text ) .
				"</username>" .
				"<id>" .
				$row->rev_user .
				"</id>" .
				"</contributor>";
		}
		$this->author_list .= "</contributors>";
	}

	/**
	 * Build and run the dump query for the configured history mode and stream
	 * the result to the sink.
	 *
	 * @param string $cond Extra SQL condition, '' for none
	 * @param bool $orderRevs
	 * @throws MWException
	 * @throws Exception
	 */
	protected function dumpFrom( $cond = '', $orderRevs = false ) {
		// $this->history may be an array (offset/limit/dir form); a bitwise
		// AND on an array operand is a TypeError on PHP 8, so only test the
		// LOGS flag on the integer form.
		$isLogDump = !is_array( $this->history ) && ( $this->history & self::LOGS );

		# For logging dumps...
		if ( $isLogDump ) {
			$where = [];
			# Hide private logs
			$hideLogs = LogEventsList::getExcludeClause( $this->db );
			if ( $hideLogs ) {
				$where[] = $hideLogs;
			}
			# Add on any caller specified conditions
			if ( $cond ) {
				$where[] = $cond;
			}
			# Get logging table name for logging.* clause
			$logging = $this->db->tableName( 'logging' );

			if ( $this->buffer == self::STREAM ) {
				$prev = $this->db->bufferResults( false );
			}
			$result = null; // Assuring $result is not undefined, if exception occurs early

			$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );
			$actorQuery = ActorMigration::newMigration()->getJoin( 'log_user' );

			try {
				$result = $this->db->select(
					array_merge( [ 'logging' ], $commentQuery['tables'], $actorQuery['tables'], [ 'user' ] ),
					[ "{$logging}.*", 'user_name' ] + $commentQuery['fields'] + $actorQuery['fields'],
					$where,
					__METHOD__,
					[ 'ORDER BY' => 'log_id', 'USE INDEX' => [ 'logging' => 'PRIMARY' ] ],
					[
						'user' => [ 'JOIN', 'user_id = ' . $actorQuery['fields']['log_user'] ]
					] + $commentQuery['joins'] + $actorQuery['joins']
				);
				$this->outputLogStream( $result );
				if ( $this->buffer == self::STREAM ) {
					$this->db->bufferResults( $prev );
				}
			} catch ( Exception $e ) {
				// Throwing the exception does not reliably free the resultset, and
				// would also leave the connection in unbuffered mode.

				// Freeing result
				try {
					if ( $result ) {
						$result->free();
					}
				} catch ( Exception $e2 ) {
					// Already in panic mode -> ignoring $e2 as $e has
					// higher priority
				}

				// Putting database back in previous buffer mode
				try {
					if ( $this->buffer == self::STREAM ) {
						$this->db->bufferResults( $prev );
					}
				} catch ( Exception $e2 ) {
					// Already in panic mode -> ignoring $e2 as $e has
					// higher priority
				}

				// Inform caller about problem
				throw $e;
			}
		# For page dumps...
		} else {
			$revOpts = [ 'page' ];
			if ( $this->text != self::STUB ) {
				$revOpts[] = 'text';
			}
			$revQuery = Revision::getQueryInfo( $revOpts );

			// We want page primary rather than revision
			$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
			$join = $revQuery['joins'] + [
				'revision' => $revQuery['joins']['page']
			];
			unset( $join['page'] );

			$fields = array_merge( $revQuery['fields'], [ 'page_restrictions' ] );

			$conds = [];
			if ( $cond !== '' ) {
				$conds[] = $cond;
			}
			$opts = [ 'ORDER BY' => 'page_id ASC' ];
			$opts['USE INDEX'] = [];
			if ( is_array( $this->history ) ) {
				# Time offset/limit for all pages/history...
				# Set time order
				if ( $this->history['dir'] == 'asc' ) {
					$op = '>';
					$opts['ORDER BY'] = 'rev_timestamp ASC';
				} else {
					$op = '<';
					$opts['ORDER BY'] = 'rev_timestamp DESC';
				}
				# Set offset
				if ( !empty( $this->history['offset'] ) ) {
					$conds[] = "rev_timestamp $op " .
						$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
				}
				# Set query limit
				if ( !empty( $this->history['limit'] ) ) {
					$opts['LIMIT'] = intval( $this->history['limit'] );
				}
			} elseif ( $this->history & self::FULL ) {
				# Full history dumps...
				# query optimization for history stub dumps
				if ( $this->text == self::STUB && $orderRevs ) {
					$tables = $revQuery['tables'];
					$opts['ORDER BY'] = [ 'rev_page ASC', 'rev_id ASC' ];
					$opts['USE INDEX']['revision'] = 'rev_page_id';
					unset( $join['revision'] );
					$join['page'] = [ 'INNER JOIN', 'rev_page=page_id' ];
				}
			} elseif ( $this->history & self::CURRENT ) {
				# Latest revision dumps...
				if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
					$this->do_list_authors( $cond );
				}
				$join['revision'] = [ 'INNER JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
			} elseif ( $this->history & self::STABLE ) {
				# "Stable" revision dumps...
				# Default JOIN, to be overridden...
				$join['revision'] = [ 'INNER JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
				# One, and only one hook should set this, and return false
				if ( Hooks::run( 'WikiExporter::dumpStableQuery', [ &$tables, &$opts, &$join ] ) ) {
					throw new MWException( __METHOD__ . " given invalid history dump type." );
				}
			} elseif ( $this->history & self::RANGE ) {
				# Dump of revisions within a specified range
				$opts['ORDER BY'] = [ 'rev_page ASC', 'rev_id ASC' ];
			} else {
				# Unknown history specification parameter?
				throw new MWException( __METHOD__ . " given invalid history dump type." );
			}

			if ( $this->buffer == self::STREAM ) {
				$prev = $this->db->bufferResults( false );
			}
			$result = null; // Assuring $result is not undefined, if exception occurs early
			try {
				// NOTE(review): $conds was already filled from $cond above, so a
				// change a hook makes to $cond here does not reach the query
				// below. Left as is; confirm whether that is intended.
				Hooks::run( 'ModifyExportQuery',
						[ $this->db, &$tables, &$cond, &$opts, &$join ] );

				# Do the query!
				$result = $this->db->select(
					$tables,
					$fields,
					$conds,
					__METHOD__,
					$opts,
					$join
				);
				# Output dump results
				$this->outputPageStream( $result );

				if ( $this->buffer == self::STREAM ) {
					$this->db->bufferResults( $prev );
				}
			} catch ( Exception $e ) {
				// Throwing the exception does not reliably free the resultset, and
				// would also leave the connection in unbuffered mode.

				// Freeing result
				try {
					if ( $result ) {
						$result->free();
					}
				} catch ( Exception $e2 ) {
					// Already in panic mode -> ignoring $e2 as $e has
					// higher priority
				}

				// Putting database back in previous buffer mode
				try {
					if ( $this->buffer == self::STREAM ) {
						$this->db->bufferResults( $prev );
					}
				} catch ( Exception $e2 ) {
					// Already in panic mode -> ignoring $e2 as $e has
					// higher priority
				}

				// Inform caller about problem
				throw $e;
			}
		}
	}

	/**
	 * Runs through a query result set dumping page and revision records.
	 * The result set should be sorted/grouped by page to avoid duplicate
	 * page records in the output.
	 *
	 * Should be safe for
	 * streaming (non-buffered) queries, as long as it was made on a
	 * separate database connection not managed by LoadBalancer; some
	 * blob storage types will make queries to pull source data.
	 *
	 * @param ResultWrapper $resultset
	 */
	protected function outputPageStream( $resultset ) {
		$last = null;
		foreach ( $resultset as $row ) {
			// A change of namespace or title marks the start of a new page:
			// close the previous one (if any) and open the new one.
			if ( $last === null ||
				$last->page_namespace != $row->page_namespace ||
				$last->page_title != $row->page_title ) {
				if ( $last !== null ) {
					$output = '';
					if ( $this->dumpUploads ) {
						$output .= $this->writer->writeUploads( $last, $this->dumpUploadFileContents );
					}
					$output .= $this->writer->closePage();
					$this->sink->writeClosePage( $output );
				}
				$output = $this->writer->openPage( $row );
				$this->sink->writeOpenPage( $row, $output );
				$last = $row;
			}
			$output = $this->writer->writeRevision( $row );
			$this->sink->writeRevision( $row, $output );
		}
		if ( $last !== null ) {
			$output = '';
			if ( $this->dumpUploads ) {
				$output .= $this->writer->writeUploads( $last, $this->dumpUploadFileContents );
			}
			// The author list is appended only here, to the last page of the
			// result set.
			$output .= $this->author_list;
			$output .= $this->writer->closePage();
			$this->sink->writeClosePage( $output );
		}
	}

	/**
	 * Hand every log row, with the XML the writer produces for it, to the sink.
	 *
	 * @param ResultWrapper $resultset
	 */
	protected function outputLogStream( $resultset ) {
		foreach ( $resultset as $row ) {
			$output = $this->writer->writeLogItem( $row );
			$this->sink->writeLogItem( $row, $output );
		}
	}
}
Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

/**
 * Builds the XML fragments (stream header, site info, pages, revisions,
 * log items and uploads) that make up a MediaWiki XML export.
 *
 * Every method returns a string; nothing is written to an output sink here.
 *
 * NOTE(review): the indentation string literals in this copy are single
 * spaces, while the docblocks of writeTimestamp()/writeContributor() used to
 * say the default indent is six spaces. Whitespace may have been collapsed
 * when this file was extracted -- confirm against upstream before relying on
 * the exact pretty-printing of the output.
 *
 * @ingroup Dump
 */
class XmlDumpWriter {
	/**
	 * Opens the XML output stream's root "<mediawiki>" element.
	 * This does not include an xml directive, so is safe to include
	 * as a subelement in a larger XML stream. Namespace and XML Schema
	 * references are included.
	 *
	 * Output will be encoded in UTF-8.
	 *
	 * @return string The opening root tag followed by the "<siteinfo>" section
	 */
	public function openStream() {
		global $wgContLang;
		$ver = WikiExporter::schemaVersion();
		return Xml::element( 'mediawiki', [
			'xmlns' => "http://www.mediawiki.org/xml/export-$ver/",
			'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
			/*
			 * When a new version of the schema is created, it needs staging on mediawiki.org.
			 * This requires a change in the operations/mediawiki-config git repo.
			 *
			 * Create a changeset like https://gerrit.wikimedia.org/r/#/c/149643/ in which
			 * you copy in the new xsd file.
			 *
			 * After it is reviewed, merged and deployed (sync-docroot), the index.html needs purging.
			 * echo "https://www.mediawiki.org/xml/index.html" | mwscript purgeList.php --wiki=aawiki
			 */
			'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
				"http://www.mediawiki.org/xml/export-$ver.xsd",
			'version' => $ver,
			'xml:lang' => $wgContLang->getHtmlCode() ],
			// Null contents: only the opening tag is wanted here; the
			// matching close comes from closeStream().
			null ) .
			"\n" .
			$this->siteInfo();
	}

	/**
	 * Builds the "<siteinfo>" section from the individual site-level elements.
	 *
	 * @return string
	 */
	public function siteInfo() {
		$info = [
			$this->sitename(),
			$this->dbname(),
			$this->homelink(),
			$this->generator(),
			$this->caseSetting(),
			$this->namespaces() ];
		return " <siteinfo>\n " .
			implode( "\n ", $info ) .
			"\n </siteinfo>\n";
	}

	/**
	 * @return string "<sitename>" element built from $wgSitename
	 */
	public function sitename() {
		global $wgSitename;
		return Xml::element( 'sitename', [], $wgSitename );
	}

	/**
	 * @return string "<dbname>" element built from $wgDBname
	 */
	public function dbname() {
		global $wgDBname;
		return Xml::element( 'dbname', [], $wgDBname );
	}

	/**
	 * @return string "<generator>" element naming the MediaWiki version ($wgVersion)
	 */
	public function generator() {
		global $wgVersion;
		return Xml::element( 'generator', [], "MediaWiki $wgVersion" );
	}

	/**
	 * @return string "<base>" element holding the canonical URL of the main page
	 */
	public function homelink() {
		return Xml::element( 'base', [], Title::newMainPage()->getCanonicalURL() );
	}

	/**
	 * @return string "<case>" element derived from $wgCapitalLinks
	 */
	public function caseSetting() {
		global $wgCapitalLinks;
		// "case-insensitive" option is reserved for future
		$sensitivity = $wgCapitalLinks ? 'first-letter' : 'case-sensitive';
		return Xml::element( 'case', [], $sensitivity );
	}

	/**
	 * Builds the "<namespaces>" list, one "<namespace>" element per formatted
	 * namespace of the content language, each with its key and case setting.
	 *
	 * @return string
	 */
	public function namespaces() {
		global $wgContLang;
		$spaces = "<namespaces>\n";
		foreach ( $wgContLang->getFormattedNamespaces() as $ns => $title ) {
			$spaces .= ' ' .
				Xml::element( 'namespace',
					[
						'key' => $ns,
						'case' => MWNamespace::isCapitalized( $ns ) ? 'first-letter' : 'case-sensitive',
					], $title ) . "\n";
		}
		$spaces .= " </namespaces>";
		return $spaces;
	}

	/**
	 * Closes the output stream with the closing root element.
	 * Call when finished dumping things.
	 *
	 * @return string
	 */
	public function closeStream() {
		return "</mediawiki>\n";
	}

	/**
	 * Opens a "<page>" section on the output stream, with data
	 * from the given database row.
	 *
	 * Reads page_namespace, page_title, page_id, page_is_redirect and
	 * page_restrictions from the row. Runs the 'XmlDumpWriterOpenPage' hook,
	 * which receives the output by reference and may modify it.
	 *
	 * @param object $row
	 * @return string
	 */
	public function openPage( $row ) {
		$out = " <page>\n";
		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
		$out .= ' ' . Xml::elementClean( 'title', [], self::canonicalTitle( $title ) ) . "\n";
		$out .= ' ' . Xml::element( 'ns', [], strval( $row->page_namespace ) ) . "\n";
		$out .= ' ' . Xml::element( 'id', [], strval( $row->page_id ) ) . "\n";
		if ( $row->page_is_redirect ) {
			$page = WikiPage::factory( $title );
			$redirect = $page->getRedirectTarget();
			// Only emit the element when the target resolves to a valid title.
			if ( $redirect instanceof Title && $redirect->isValidRedirectTarget() ) {
				$out .= ' ';
				$out .= Xml::element( 'redirect', [ 'title' => self::canonicalTitle( $redirect ) ] );
				$out .= "\n";
			}
		}

		// Loose comparison on purpose: a null column is treated like ''.
		if ( $row->page_restrictions != '' ) {
			$out .= ' ' . Xml::element( 'restrictions', [],
				strval( $row->page_restrictions ) ) . "\n";
		}

		Hooks::run( 'XmlDumpWriterOpenPage', [ $this, &$out, $row, $title ] );

		return $out;
	}

	/**
	 * Closes a "<page>" section on the output stream.
	 *
	 * Previously annotated "@access private", but the method never carried a
	 * visibility keyword and is therefore public; it stays public for
	 * backward compatibility.
	 *
	 * @return string
	 */
	public function closePage() {
		return " </page>\n";
	}

	/**
	 * Dumps a "<revision>" section on the output stream, with
	 * data filled in from the given database row.
	 *
	 * Fields flagged in rev_deleted (user, comment, text) are replaced by
	 * placeholder elements carrying deleted="deleted". Runs the
	 * 'XmlDumpWriterWriteRevision' hook before the closing tag.
	 *
	 * Previously annotated "@access private", but the method never carried a
	 * visibility keyword and is therefore public; it stays public for
	 * backward compatibility.
	 *
	 * @param object $row
	 * @return string
	 */
	public function writeRevision( $row ) {
		// Computed once with an isset() guard so that rows lacking a
		// rev_deleted column do not trigger an undefined-property notice in
		// either the <text> or the <sha1> decision below.
		$textDeleted = isset( $row->rev_deleted )
			&& ( $row->rev_deleted & Revision::DELETED_TEXT );

		$out = " <revision>\n";
		$out .= " " . Xml::element( 'id', null, strval( $row->rev_id ) ) . "\n";
		if ( isset( $row->rev_parent_id ) && $row->rev_parent_id ) {
			$out .= " " . Xml::element( 'parentid', null, strval( $row->rev_parent_id ) ) . "\n";
		}

		$out .= $this->writeTimestamp( $row->rev_timestamp );

		if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_USER ) ) {
			$out .= " " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
		} else {
			$out .= $this->writeContributor( $row->rev_user, $row->rev_user_text );
		}

		if ( isset( $row->rev_minor_edit ) && $row->rev_minor_edit ) {
			$out .= " <minor/>\n";
		}
		if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_COMMENT ) ) {
			$out .= " " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
		} else {
			$comment = CommentStore::getStore()->getComment( 'rev_comment', $row )->text;
			if ( $comment != '' ) {
				$out .= " " . Xml::elementClean( 'comment', [], strval( $comment ) ) . "\n";
			}
		}

		// isset() is already false for null, so no separate null check is needed.
		if ( isset( $row->rev_content_model ) ) {
			$content_model = strval( $row->rev_content_model );
		} else {
			// probably using $wgContentHandlerUseDB = false;
			$title = Title::makeTitle( $row->page_namespace, $row->page_title );
			$content_model = ContentHandler::getDefaultModelFor( $title );
		}

		$content_handler = ContentHandler::getForModelID( $content_model );

		if ( isset( $row->rev_content_format ) ) {
			$content_format = strval( $row->rev_content_format );
		} else {
			// probably using $wgContentHandlerUseDB = false;
			$content_format = $content_handler->getDefaultFormat();
		}

		$out .= " " . Xml::element( 'model', null, strval( $content_model ) ) . "\n";
		$out .= " " . Xml::element( 'format', null, strval( $content_format ) ) . "\n";

		// Stays '' for deleted text and for stub output; it is handed to the
		// hook below either way.
		$text = '';
		if ( $textDeleted ) {
			$out .= " " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n";
		} elseif ( isset( $row->old_text ) ) {
			// Raw text from the database may have invalid chars
			$text = strval( Revision::getRevisionText( $row ) );
			$text = $content_handler->exportTransform( $text, $content_format );
			$out .= " " . Xml::elementClean( 'text',
				[ 'xml:space' => 'preserve', 'bytes' => intval( $row->rev_len ) ],
				strval( $text ) ) . "\n";
		} else {
			// Stub output
			$out .= " " . Xml::element( 'text',
				[ 'id' => $row->rev_text_id, 'bytes' => intval( $row->rev_len ) ],
				"" ) . "\n";
		}

		if ( isset( $row->rev_sha1 )
			&& $row->rev_sha1
			&& !$textDeleted
		) {
			$out .= " " . Xml::element( 'sha1', null, strval( $row->rev_sha1 ) ) . "\n";
		} else {
			$out .= " <sha1/>\n";
		}

		// Avoid PHP 7.1 warning from passing $this by reference
		$writer = $this;
		Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text ] );

		$out .= " </revision>\n";

		return $out;
	}

	/**
	 * Dumps a "<logitem>" section on the output stream, with
	 * data filled in from the given database row.
	 *
	 * Fields flagged in log_deleted (user, comment, action) are replaced by
	 * placeholder elements carrying deleted="deleted".
	 *
	 * Previously annotated "@access private", but the method never carried a
	 * visibility keyword and is therefore public; it stays public for
	 * backward compatibility.
	 *
	 * @param object $row
	 * @return string
	 */
	public function writeLogItem( $row ) {
		$out = " <logitem>\n";
		$out .= " " . Xml::element( 'id', null, strval( $row->log_id ) ) . "\n";

		$out .= $this->writeTimestamp( $row->log_timestamp, " " );

		if ( $row->log_deleted & LogPage::DELETED_USER ) {
			$out .= " " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
		} else {
			$out .= $this->writeContributor( $row->log_user, $row->user_name, " " );
		}

		if ( $row->log_deleted & LogPage::DELETED_COMMENT ) {
			$out .= " " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
		} else {
			$comment = CommentStore::getStore()->getComment( 'log_comment', $row )->text;
			if ( $comment != '' ) {
				$out .= " " . Xml::elementClean( 'comment', null, strval( $comment ) ) . "\n";
			}
		}

		$out .= " " . Xml::element( 'type', null, strval( $row->log_type ) ) . "\n";
		$out .= " " . Xml::element( 'action', null, strval( $row->log_action ) ) . "\n";

		if ( $row->log_deleted & LogPage::DELETED_ACTION ) {
			// A hidden action suppresses both the target title and the params.
			$out .= " " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n";
		} else {
			$title = Title::makeTitle( $row->log_namespace, $row->log_title );
			$out .= " " . Xml::elementClean( 'logtitle', null, self::canonicalTitle( $title ) ) . "\n";
			$out .= " " . Xml::elementClean( 'params',
				[ 'xml:space' => 'preserve' ],
				strval( $row->log_params ) ) . "\n";
		}

		$out .= " </logitem>\n";

		return $out;
	}

	/**
	 * Builds a "<timestamp>" element in ISO 8601 format.
	 *
	 * @param string $timestamp Timestamp in any format accepted by wfTimestamp()
	 * @param string $indent Whitespace placed before the element.
	 *   NOTE(review): formerly documented as "Default to six spaces", but the
	 *   default literal in this copy is a single space -- confirm against upstream.
	 * @return string
	 */
	public function writeTimestamp( $timestamp, $indent = " " ) {
		$ts = wfTimestamp( TS_ISO_8601, $timestamp );
		return $indent . Xml::element( 'timestamp', null, $ts ) . "\n";
	}

	/**
	 * Builds a "<contributor>" element.
	 *
	 * A contributor with a non-zero ID, or whose name is not a valid IP
	 * address, is written as "<username>" + "<id>"; otherwise the name is
	 * written as "<ip>".
	 *
	 * @param int $id
	 * @param string $text
	 * @param string $indent Whitespace placed before each line.
	 *   NOTE(review): formerly documented as "Default to six spaces", but the
	 *   default literal in this copy is a single space -- confirm against upstream.
	 * @return string
	 */
	public function writeContributor( $id, $text, $indent = " " ) {
		$out = $indent . "<contributor>\n";
		if ( $id || !IP::isValid( $text ) ) {
			$out .= $indent . " " . Xml::elementClean( 'username', null, strval( $text ) ) . "\n";
			$out .= $indent . " " . Xml::element( 'id', null, strval( $id ) ) . "\n";
		} else {
			$out .= $indent . " " . Xml::elementClean( 'ip', null, strval( $text ) ) . "\n";
		}
		$out .= $indent . "</contributor>\n";
		return $out;
	}

	/**
	 * Builds "<upload>" elements for every version of the file behind a
	 * File-namespace page: history entries first (in reversed getHistory()
	 * order), then the current version.
	 *
	 * Warning! This data is potentially inconsistent. :(
	 *
	 * @param object $row
	 * @param bool $dumpContents Whether to embed the file contents as base64
	 * @return string Empty string when the page is not in NS_FILE or no
	 *   existing local file is found
	 */
	public function writeUploads( $row, $dumpContents = false ) {
		if ( $row->page_namespace == NS_FILE ) {
			$img = wfLocalFile( $row->page_title );
			if ( $img && $img->exists() ) {
				$out = '';
				foreach ( array_reverse( $img->getHistory() ) as $ver ) {
					$out .= $this->writeUpload( $ver, $dumpContents );
				}
				$out .= $this->writeUpload( $img, $dumpContents );
				return $out;
			}
		}
		return '';
	}

	/**
	 * Builds a single "<upload>" element for one file version.
	 *
	 * @param File $file
	 * @param bool $dumpContents Whether to embed the file contents as base64
	 * @return string
	 */
	public function writeUpload( $file, $dumpContents = false ) {
		if ( $file->isOld() ) {
			$archiveName = " " .
				Xml::element( 'archivename', null, $file->getArchiveName() ) . "\n";
		} else {
			$archiveName = '';
		}
		if ( $dumpContents ) {
			$be = $file->getRepo()->getBackend();
			# Dump file as base64
			# Uses only XML-safe characters, so does not need escaping
			# @todo Too bad this loads the contents into memory (script might swap)
			# NOTE(review): the result of getFileContents() is not checked here;
			# presumably a failed read would yield an empty <contents> element -- confirm.
			$contents = ' <contents encoding="base64">' .
				chunk_split( base64_encode(
					$be->getFileContents( [ 'src' => $file->getPath() ] ) ) ) .
				" </contents>\n";
		} else {
			$contents = '';
		}
		if ( $file->isDeleted( File::DELETED_COMMENT ) ) {
			$comment = Xml::element( 'comment', [ 'deleted' => 'deleted' ] );
		} else {
			$comment = Xml::elementClean( 'comment', null, strval( $file->getDescription() ) );
		}
		return " <upload>\n" .
			$this->writeTimestamp( $file->getTimestamp() ) .
			$this->writeContributor( $file->getUser( 'id' ), $file->getUser( 'text' ) ) .
			" " . $comment . "\n" .
			" " . Xml::element( 'filename', null, $file->getName() ) . "\n" .
			$archiveName .
			" " . Xml::element( 'src', null, $file->getCanonicalUrl() ) . "\n" .
			" " . Xml::element( 'size', null, $file->getSize() ) . "\n" .
			" " . Xml::element( 'sha1base36', null, $file->getSha1() ) . "\n" .
			" " . Xml::element( 'rel', null, $file->getRel() ) . "\n" .
			$contents .
			" </upload>\n";
	}

	/**
	 * Return prefixed text form of title, but using the content language's
	 * canonical namespace. This skips any special-casing such as gendered
	 * user namespaces -- which while useful, are not yet listed in the
	 * XML "<siteinfo>" data so are unsafe in export.
	 *
	 * External (interwiki) titles are returned via getPrefixedText() unchanged.
	 *
	 * @param Title $title
	 * @return string
	 * @since 1.18
	 */
	public static function canonicalTitle( Title $title ) {
		if ( $title->isExternal() ) {
			return $title->getPrefixedText();
		}

		global $wgContLang;
		$prefix = $wgContLang->getFormattedNsText( $title->getNamespace() );

		// @todo Emit some kind of warning to the user if $title->getNamespace() !==
		// NS_MAIN and $prefix === '' (viz. pages in an unregistered namespace)

		if ( $prefix !== '' ) {
			$prefix .= ':';
		}

		return $prefix . $title->getText();
	}
}