Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Wikipedia: Mediawiki-CVS

SVN: [40100] trunk/extensions

 

 

Wikipedia mediawiki-cvs RSS feed   Index | Next | Previous | View Threaded


brion at svn

Aug 27, 2008, 12:56 PM

Post #1 of 1 (32 views)
Permalink
SVN: [40100] trunk/extensions

Revision: 40100
Author: brion
Date: 2008-08-27 19:56:07 +0000 (Wed, 27 Aug 2008)

Log Message:
-----------
Add extension for the OpenSearch suggest XML variant supported by IE 8 beta 2.
Includes preliminary versions of some text & image extraction code which will be generalized to other search interfaces pending improved support:

* OpenSearch suggest JSON interface provides for text extracts, but I believe Firefox doesn't support it currently
* Image extractions could be used nicely for on-site search, and we'll want it for future geo-type searches.
* Improved text extraction could be used to update the abstract extension and various other list-like interfaces.

Added Paths:
-----------
trunk/extensions/OpenSearchXml/
trunk/extensions/OpenSearchXml/ApiOpenSearchXml.php
trunk/extensions/OpenSearchXml/OpenSearchXml.php

Added: trunk/extensions/OpenSearchXml/ApiOpenSearchXml.php
===================================================================
--- trunk/extensions/OpenSearchXml/ApiOpenSearchXml.php (rev 0)
+++ trunk/extensions/OpenSearchXml/ApiOpenSearchXml.php 2008-08-27 19:56:07 UTC (rev 40100)
@@ -0,0 +1,340 @@
+<?php
+
+/*
+ * Created on Oct 13, 2006
+ * Adapted to XML output variant, plus extra text extraction 2008
+ * Text extraction adapted from ActiveAbstract extension.
+ *
+ * API for MediaWiki 1.8+
+ *
+ * Copyright (C) 2006 Yuri Astrakhan <Firstname><Lastname>@gmail.com
+ * Copyright (C) 2008 Brion Vibber <brion[at]wikimedia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+/**
+ * @ingroup API
+ */
+class ApiOpenSearchXml extends ApiOpenSearch {
+
+ /**
+  * Create the output printer for the validated format; the XML variants
+  * get their root element named 'SearchSuggestion'.
+  */
+ public function getCustomPrinter() {
+     $chosen = $this->validateFormat();
+     $out = $this->getMain()->createPrinterByName( $chosen );
+     if( !$this->inXmlMode() ) {
+         return $out;
+     }
+     $out->setRootElement( 'SearchSuggestion' );
+     return $out;
+ }
+
+ /**
+  * Return the requested 'format' parameter when it is one of the supported
+  * printers, otherwise fall back to the first supported one ('json').
+  */
+ protected function validateFormat() {
+     $supported = array( 'json', 'jsonfm', 'xml', 'xmlfm' );
+     $request = $this->extractRequestParams();
+     $wanted = $request['format'];
+     return in_array( $wanted, $supported ) ? $wanted : $supported[0];
+ }
+
+ /**
+  * Tell whether the validated output format is one of the XML variants.
+  */
+ protected function inXmlMode() {
+     $xmlFormats = array( 'xml', 'xmlfm' );
+     return in_array( $this->validateFormat(), $xmlFormats );
+ }
+
+ /**
+  * Run the prefix search and, in XML mode, emit the results in the
+  * searchsuggest2 structure. Non-XML formats are delegated to the parent.
+  */
+ public function execute() {
+     if( !$this->inXmlMode() ) {
+         // Anything but the XML variants is served by the stock implementation.
+         return parent::execute();
+     }
+
+     $request = $this->extractRequestParams();
+     $query = $request['search'];
+
+     // Allow the response to be cached (max age 1200, presumably seconds).
+     $this->getMain()->setCacheMaxAge(1200);
+
+     $hits = PrefixSearch::titleSearch( $query, $request['limit'], $request['namespace'] );
+
+     // formatItem() returns false for titles already emitted; array_filter() drops those.
+     $formatted = array_map( array( $this, 'formatItem' ), $hits );
+     $items = array_filter( $formatted );
+
+     $output = $this->getResult();
+     $output->addValue( null, 'version', '2.0' );
+     $output->addValue( null, 'xmlns', 'http://opensearch.org/searchsuggest2' );
+     $output->addValue( null, 'Query', array( '*' => strval( $query ) ) );
+     $output->setIndexedTagName( $items, 'Item' );
+     $output->addValue( null, 'Section', $items );
+ }
+
+ /**
+  * Parent's parameter list with the 'format' entry reset to null; the
+  * accepted values are checked in validateFormat() instead.
+  */
+ public function getAllowedParams() {
+     $allowed = parent::getAllowedParams();
+     $allowed['format'] = null;
+     return $allowed;
+ }
+
+ /**
+  * Build the result entry for a single search hit.
+  *
+  * @param string $name title text as returned by the prefix search
+  * @return array|false item data keyed by element name, or false when the
+  *   (redirect-resolved) title was already emitted for this request
+  */
+ protected function formatItem( $name ) {
+     $title = Title::newFromText( $name );
+     if( !$title ) {
+         // Not a parseable title: echo the raw text back with no extras.
+         return array( 'Text' => array( '*' => $name ) );
+     }
+
+     $title = $this->_checkRedirect( $title );
+     if( $this->_seen( $title ) ) {
+         return false;
+     }
+
+     // Guard against a non-array result so list() always has two
+     // elements to unpack.
+     $extracted = $this->getExtract( $title );
+     if( !is_array( $extracted ) ) {
+         $extracted = array( strval( $extracted ), false );
+     }
+     list( $extract, $badge ) = $extracted;
+     $image = $this->getBadge( $title, $badge );
+
+     $item = array();
+     $item['Text']['*'] = $title->getPrefixedText();
+     $item['Description']['*'] = $extract;
+     $item['Url']['*'] = $title->getFullUrl();
+     if( $image ) {
+         $thumb = $image->getThumbnail( 50, 50 );
+         // Leave the image out rather than fail if no thumbnail came back.
+         if( $thumb ) {
+             $item['Image'] = array(
+                 'source' => wfExpandUrl( $thumb->getUrl() ),
+                 //alt
+                 'width' => $thumb->getWidth(),
+                 'height' => $thumb->getHeight() );
+         }
+     }
+     return $item;
+ }
+
+ /**
+  * Resolve a redirect: returns the redirect target when the article
+  * reports one, otherwise the title that was passed in.
+  */
+ protected function _checkRedirect( $title ) {
+     $article = new Article( $title );
+     $destination = $article->getRedirectTarget();
+     return $destination ? $destination : $title;
+ }
+
+ /**
+  * Record a title as emitted and report whether it had been recorded
+  * before. Titles are keyed by their prefixed text.
+  */
+ protected function _seen( $title ) {
+     $key = $title->getPrefixedText();
+     $already = isset( $this->mSeen[$key] );
+     if( !$already ) {
+         $this->mSeen[$key] = true;
+     }
+     return $already;
+ }
+
+ /**
+  * Strip markup to show plaintext.
+  *
+  * Only the first 4096 bytes are considered. Bold/italic quotes, magic
+  * words, internal links (via _stripLink), bracketed external links,
+  * HTML-style tags, tables and colon-indented lines are removed, then
+  * character references are decoded.
+  *
+  * @param string $text
+  * @return string
+  * @access private
+  */
+ function _stripMarkup( $text ) {
+     $text = substr( $text, 0, 4096 ); // don't bother with long text...
+
+     $text = str_replace( "'''", "", $text );
+     $text = str_replace( "''", "", $text );
+
+     $text = preg_replace( '#__[a-z0-9_]+__#i', '', $text ); // magic words
+
+     $cleanChar = "[^|\[\]]";
+     $subLink = "\[\[$cleanChar*(?:\|$cleanChar*)*\]\]";
+     $pipeContents = "(?:$cleanChar|$subLink)*";
+     $text = preg_replace_callback( "#
+ \[\[
+ ($cleanChar*)
+ (?:\|($pipeContents))?
+ (?:\|$pipeContents)*
+ \]\]
+ #six", array( $this, '_stripLink' ), $text );
+
+     // The protocol alternation must be concatenated in: inside a
+     // single-quoted string "$protocols" is literal text and the pattern
+     // would never match an external link.
+     $protocols = wfUrlProtocols();
+     $text = preg_replace( '#\\[(?:' . $protocols . ').*? (.*?)\\]#s', '$1', $text ); // URL links
+     $text = preg_replace( '#</?[a-z0-9]+.*?>#s', '', $text ); // HTML-style tags
+     $text = preg_replace( '#\\{\\|.*?\\|\\}#s', '', $text ); // tables
+
+     $text = preg_replace( '#^:.*$#m', '', $text ); // indented lines near start are usually disambigs or notices
+     $text = Sanitizer::decodeCharReferences( $text );
+     return trim( $text );
+ }
+
+ /**
+  * preg_replace_callback() handler for internal links: interwiki, image
+  * and category links are removed, other links are replaced by their
+  * label, and unparseable targets are left as they were.
+  */
+ function _stripLink( $matches ) {
+     // Group 1 is the link target; group 2, when present, is the first piped label.
+     $target = trim( $matches[1] );
+     $label = isset( $matches[2] ) ? trim( $matches[2] ) : $target;
+
+     $title = Title::newFromText( $target );
+     if( !$title ) {
+         return $matches[0];
+     }
+
+     $ns = $title->getNamespace();
+     $dropped = $title->getInterwiki() || $ns == NS_IMAGE || $ns == NS_CATEGORY;
+     return $dropped ? "" : $label;
+ }
+
+ /**
+  * Extract the first sentence, if detectable, from the text; otherwise
+  * return the first line. The result is trimmed in both cases.
+  *
+  * A sentence ends at an ASCII '.', '!' or '?' followed by whitespace
+  * (a '.' directly after a digit does not count), or at one of the CJK /
+  * full-width terminators listed below.
+  *
+  * @param string $text
+  * @return string
+  * @access private
+  */
+ function _extractStart( $text ) {
+     // Non-ASCII terminators are written as \x{...} escapes (valid under
+     // the /u modifier) so the pattern does not depend on the file's
+     // encoding surviving intact. A bare '?' or '.' here would be a
+     // quantifier error / match-anything wildcard.
+     $endchars = array(
+         '([^\d])\.\s', '\!\s', '\?\s', // regular ASCII
+         '\x{3002}', // full-width ideographic full-stop
+         '\x{FF0E}', '\x{FF01}', '\x{FF1F}', // double-width roman forms of . ! ?
+         '\x{FF61}', // half-width ideographic full stop
+     );
+
+     $endgroup = implode( '|', $endchars );
+     $end = "(?:$endgroup)";
+     $sentence = ".*?$end+";
+     $firstone = "/^($sentence)/u";
+     if( preg_match( $firstone, $text, $matches ) ) {
+         // The ASCII terminators consume one trailing whitespace character.
+         return trim( $matches[1] );
+     }
+
+     // No terminator found (or the text was not valid UTF-8 for /u):
+     // just return the first line.
+     $lines = explode( "\n", $text );
+     return trim( $lines[0] );
+ }
+
+ /**
+  * Grab the first thing that looks like an image link from the given text.
+  *
+  * Both the literal "image" prefix and the content language's name for
+  * the image namespace are accepted, in any letter case.
+  *
+  * @param string $text
+  * @return string|false the trimmed image name (without namespace prefix),
+  *   or false when no image link is found
+  */
+ function _extractBadge( $text ) {
+     global $wgContLang;
+     $image = preg_quote( $wgContLang->getNsText( NS_IMAGE ), '#' );
+     // Case-insensitive: without /i the literal "image" alternative would
+     // miss the usual capitalised "[[Image:...]]" on wikis whose localized
+     // namespace name differs.
+     if( preg_match( "#\[\[\s*(?:image|$image)\s*:\s*([^|\]]+)#i", $text, $matches ) ) {
+         return trim( $matches[1] );
+     }
+     return false;
+ }
+
+ /**
+  * Turn a template's image argument into an image name.
+  *
+  * @param string $arg raw argument value
+  * @return string|false image name, or false if the argument does not
+  *   resolve to a title in the image namespace
+  */
+ function _validateBadge( $arg ) {
+     // Some templates want an entire [[Image:Foo.jpg|250px]].
+     // Compare the two leading characters against exactly two brackets;
+     // a longer comparison string could never equal a 2-character prefix.
+     if( substr( $arg, 0, 2 ) == '[[' ) {
+         return $this->_extractBadge( $arg );
+     }
+
+     // Others will take Image:Foo.jpg or Foo.jpg
+     $title = Title::newFromText( $arg, NS_IMAGE );
+     if( $title && $title->getNamespace() == NS_IMAGE ) {
+         return $title->getDbKey();
+     }
+     return false;
+ }
+
+ /**
+  * Produce a short plaintext extract and a candidate badge image name
+  * from the revision that Revision::newFromTitle() loads for a page.
+  *
+  * Only the first 16384 bytes of the revision text are examined.
+  *
+  * @param Title $title
+  * @param int $chars currently unused
+  * @return array two elements: the extract string, and the badge image
+  *   name or false. Always an array, so callers can list() the result.
+  */
+ protected function getExtract( $title, $chars=50 ) {
+     $rev = Revision::newFromTitle( $title );
+     if( !$rev ) {
+         // Same shape as the success path, with nothing found.
+         return array( '', false );
+     }
+
+     $text = substr( $rev->getText(), 0, 16384 );
+
+     // Ok, first note this is a TERRIBLE HACK. :D
+     //
+     // First, we use the system preprocessor to break down the text
+     // into text, templates, extensions, and comments:
+     global $wgParser;
+     $wgParser->clearState();
+     $wgParser->mOptions = new ParserOptions();
+     $frame = $wgParser->getPreprocessor()->newFrame();
+     $dom = $wgParser->preprocessToDom( $text );
+
+     // Template argument names that are treated as carrying an image.
+     $imageArgs = array(
+         'image',
+         'image_skyline',
+         'img',
+         'Img',
+     );
+
+     // Now, we strip out everything that's not text.
+     // This works with both DOM and Hash parsers, but feels fragile.
+     $node = $dom->getFirstChild();
+     $out = '';
+     $badge = false;
+     while( $node ) {
+         if( $node->getName() == '#text' ) {
+             $out .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
+         } elseif( !$badge && $node->getName() == 'template' ) {
+             // Look for an "image" parameter child node
+             $parts = $node->getFirstChild();
+             while( $parts ) {
+                 if( $parts->getName() == "part" ) {
+                     $arg = $parts->splitArg();
+                     $argName = trim( $frame->expand( $arg["name"], PPFrame::RECOVER_ORIG ) );
+                     if( in_array( $argName, $imageArgs ) ) {
+                         $badge = $this->_validateBadge(
+                             trim(
+                                 $frame->expand( $arg["value"], PPFrame::RECOVER_ORIG ) ) );
+                         if( $badge ) {
+                             break; // from the arg loop
+                         }
+                     }
+                 }
+                 $parts = $parts->getNextSibling();
+             }
+         }
+         $node = $node->getNextSibling();
+     }
+
+     if( !$badge ) {
+         // Look for the first image in the body text if there wasn't
+         // one in an infobox.
+         $badge = $this->_extractBadge( $out );
+     }
+
+     // The remaining text may still contain wiki and HTML markup.
+     // The simple hand-rolled stripper removes most of it from the
+     // beginning of the text.
+     $stripped = $this->_stripMarkup( $out );
+
+     // And now, we'll grab just the first sentence as text, and
+     // return it together with the badge image found above.
+     return array(
+         $this->_extractStart( $stripped ),
+         $badge );
+ }
+
+ /**
+  * Find the file to show as a badge: the page's own file when the title
+  * is in the image namespace, otherwise the image name found in the text
+  * (if any). Returns the file only when it exists, else null.
+  */
+ protected function getBadge( $title, $fromText ) {
+     // See if we are an image page ourselves, or found an [[Image:xxx]] in the text...
+     $lookup = ( $title->getNamespace() == NS_IMAGE ) ? $title : $fromText;
+     if( $lookup ) {
+         $file = wfFindFile( $lookup );
+         if( $file && $file->exists() ) {
+             return $file;
+         }
+     }
+     return null;
+ }
+}


Property changes on: trunk/extensions/OpenSearchXml/ApiOpenSearchXml.php
___________________________________________________________________
Name: svn:eol-style
+ native

Added: trunk/extensions/OpenSearchXml/OpenSearchXml.php
===================================================================
--- trunk/extensions/OpenSearchXml/OpenSearchXml.php (rev 0)
+++ trunk/extensions/OpenSearchXml/OpenSearchXml.php 2008-08-27 19:56:07 UTC (rev 40100)
@@ -0,0 +1,48 @@
+<?php
+
+/*
+ * Copyright (C) 2008 Brion Vibber <brion[at]wikimedia.org>
+ * http://www.mediawiki.org/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+$wgAPIModules['opensearch'] = 'ApiOpenSearchXml'; // name the XML-capable class as the handler for the 'opensearch' API module
+$wgAutoloadClasses['ApiOpenSearchXml'] =
+ dirname(__FILE__) . '/ApiOpenSearchXml.php'; // class file sits next to this setup file
+
+$wgHooks['OpenSearchUrls'][] = 'efOpenSearchXmlUrls'; // handler appends the XML suggestion URL entry
+
+$wgOpenSearchAdvertiseXml = true; // read by efOpenSearchXmlUrls(); a false value stops the XML URL from being added
+
+/**
+ * OpenSearchUrls hook handler: appends the XML suggestions URL entry when
+ * both $wgEnableAPI and $wgOpenSearchAdvertiseXml are true.
+ * Always returns true.
+ */
+function efOpenSearchXmlUrls( &$urls ) {
+    global $wgEnableAPI, $wgOpenSearchAdvertiseXml;
+    if( !$wgEnableAPI || !$wgOpenSearchAdvertiseXml ) {
+        return true;
+    }
+
+    $entry = array(
+        'type' => 'application/x-suggestions+xml',
+        'method' => 'get',
+        'template' => efOpenSearchXmlTemplate() );
+    $urls[] = $entry;
+    return true;
+}
+
+/**
+ * Build the URL template for the XML suggestions endpoint, restricted to
+ * the search engine's default namespaces ("0" when that list is empty).
+ */
+function efOpenSearchXmlTemplate() {
+    global $wgServer, $wgScriptPath;
+    $namespaces = implode( ',', SearchEngine::defaultNamespaces() );
+    if( !$namespaces ) {
+        $namespaces = "0";
+    }
+    $base = $wgServer . $wgScriptPath;
+    return $base . '/api.php?action=opensearch&format=xml&search={searchTerms}&namespace=' . $namespaces;
+}


Property changes on: trunk/extensions/OpenSearchXml/OpenSearchXml.php
___________________________________________________________________
Name: svn:eol-style
+ native



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS[at]lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Wikipedia mediawiki-cvs RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact lists@gossamer-threads.com
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.