Continuing from the post "Scraping webpages with Python and QWebElement", here is the second in this series, where a real search finally happens.
#!/usr/bin/python
# Drive a QtWebKit page from Python: load the Google homepage, submit a search
# for "drupal" through the page's own form, and print every result item.

# These lines will get us the modules we need.
import pprint
import sys

from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage, QWebView


class Scrape(QApplication):
    """Qt application that performs one Google search and prints the hits.

    The work is split across two ``loadFinished`` handlers: ``searchForm``
    runs when the first page has loaded and submits the query, then
    ``searchResults`` runs when the following page has loaded, prints the
    matches and stops the event loop.
    """

    def __init__(self):
        # Apparently there are a number of versions of this init and PyQt
        # figures out which you want based on the arguments. We do not need
        # to pass anything on, so hand it a single None.
        super(Scrape, self).__init__(None)
        # The view that loads and holds the pages being scraped.
        self.webView = QWebView()
        # Call searchForm when the view's loadFinished signal fires.
        self.webView.loadFinished.connect(self.searchForm)

    def load(self, url):
        """Start loading ``url`` (a plain string) into the web view."""
        # QWebView.load() needs a QUrl instance rather than a string.
        self.webView.load(QUrl(url))

    def searchForm(self, ok=True):
        """Fill in the search box on the loaded page and submit the form.

        ``ok`` is the success flag delivered by ``loadFinished``; it defaults
        to True so the method can still be called without arguments.  If the
        load failed, or the expected form elements are not on the page, the
        application exits with status 1 instead of waiting forever for a
        second ``loadFinished`` that would never arrive.
        """
        if not ok:
            sys.stderr.write('Loading the search page failed.\n')
            self.exit(1)
            return

        # The root document element is a QWebElement (added in Qt 4.6), which
        # allows CSS-selector based DOM access.
        documentElement = self.webView.page().currentFrame().documentElement()

        # Locate both the query field and the submit button up front.
        inputSearch = documentElement.findFirst('input[title="Google Search"]')
        submitButton = documentElement.findFirst('input[name=btnG]')
        if inputSearch.isNull() or submitButton.isNull():
            # findFirst() hands back a null element when nothing matches;
            # clicking it would do nothing and the event loop would hang.
            sys.stderr.write('Search form elements not found on the page.\n')
            self.exit(1)
            return

        inputSearch.setAttribute('value', 'drupal')

        # The next loadFinished belongs to the results page, so swap handlers
        # before triggering the navigation.
        self.webView.loadFinished.disconnect(self.searchForm)
        self.webView.loadFinished.connect(self.searchResults)
        submitButton.evaluateJavaScript('this.click()')

    def searchResults(self, ok=True):
        """Print every ``li.g`` element of the loaded page, then quit.

        ``ok`` is the success flag delivered by ``loadFinished``.  A failed
        load ends the application with status 1; otherwise it exits with
        status 0 after printing.
        """
        if not ok:
            sys.stderr.write('Loading the results page failed.\n')
            self.exit(1)
            return

        # Grab the root document element again and collect all list items
        # carrying the "g" class.
        results = self.webView.page().currentFrame().documentElement().findAll('li.g')

        # Turn the QWebElementCollection into a list so it can be iterated.
        for element in results.toList():
            # Encode as UTF-8 rather than toAscii(), which is a lossy 8-bit
            # conversion for text outside its codec. Like the original call,
            # this relies on toOuterXml() returning a QString. The
            # single-argument parenthesised print behaves the same as the
            # print statement on Python 2.
            print(element.toOuterXml().toUtf8())

        # We are inside a Qt application and need to terminate it properly.
        self.exit()


# Instantiate our class.
my_scrape = Scrape()
# Load the Google homepage.
my_scrape.load('http://google.com/ncr')
# Start the Qt event loop and hand its return code back to the shell, so a
# failed scrape is visible to the caller.
sys.exit(my_scrape.exec_())
Update: Daniel Wehner has written a real-world script based on this post to log in to Cisco authentication.
Commenting on this Story is closed.