from __future__ import(division, absolute_import, print_function,
unicode_literals)
from multiprocessing import Process
import time
from pprint import pprint
from pysimsearch.sim_index import MemorySimIndex
from pysimsearch.sim_index import RemoteSimIndex
from pysimsearch.sim_index import SimIndexCollection
from pysimsearch import similarity
from pysimsearch import sim_server
def sample_similarity():
    """Fetch several university homepages and print their pairwise similarities."""
    print()
    print("Printing pairwise similarities of university homepages")
    homepages = ('http://www.stanford.edu/',
                 'http://www.berkeley.edu/',
                 'http://www.ucla.edu',
                 'http://www.mit.edu/')
    pprint(similarity.pairwise_compare_filenames(*homepages))
def sample_sim_index():
    """Build an in-memory index, query it, then round-trip it through disk."""
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_filenames('http://www.stanford.edu/',
                              'http://www.berkeley.edu',
                              'http://www.ucla.edu',
                              'http://www.mit.edu')

    # Basic term lookups against the freshly built index.
    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    # Run the same similarity query under each available scorer.
    for scorer_name, scorer_label in (('simple_count', 'simple scorer'),
                                      ('tfidf', 'tf.idf scorer')):
        print()
        print("Similarity search for query 'stanford university'"
              " ({})".format(scorer_label))
        sim_index.set_query_scorer(scorer_name)
        pprint(list(sim_index.query_by_string("stanford university")))

    # Persist the index, reload it, and show the loaded copy answers queries.
    # NOTE(review): save/load use text-mode files here — confirm
    # MemorySimIndex.save does not emit binary data (would need "wb"/"rb").
    print()
    print("Saving index to disk")
    with open("myindex.idx", "w") as index_file:
        sim_index.save(index_file)
    print()
    print("Loading index from disk")
    with open("myindex.idx", "r") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)
    print()
    print("Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms('university', 'california')))
def sample_sim_index_collection():
    """Shard an index across two in-memory backends and query the collection."""
    print()
    print("SimIndexCollection: build a collection, index some urls, and query it")
    index_coll = SimIndexCollection()
    index_coll.add_shards(MemorySimIndex(), MemorySimIndex())
    index_coll.set_query_scorer('tfidf')
    urls = ('http://www.stanford.edu/',
            'http://www.berkeley.edu',
            'http://www.ucla.edu',
            'http://www.mit.edu')
    index_coll.index_urls(*urls)
    pprint(index_coll.query_by_string('stanford university'))
def sample_remote_indexes(num_shards=2, base_port=9000):
    """Spawn local sim-index server processes and query them through a
    SimIndexCollection of RemoteSimIndex shards.

    Args:
        num_shards: number of backend server processes to launch (default 2,
            matching the original sample).
        base_port: first TCP port to listen on; shard i uses base_port + i.
    """
    print()
    print("SimIndexCollection with remote backend indexes")

    # Single source of truth for the ports; previously the base port and
    # shard count were duplicated across two independent loops.
    ports = [base_port + i for i in range(num_shards)]

    processes = []
    for port in ports:
        process = Process(target=sim_server.start_sim_index_server,
                          kwargs={'port': port, 'logRequests': False})
        process.daemon = True  # ensure children die if the parent exits abruptly
        processes.append(process)
    for process in processes:
        process.start()

    print("Waiting for servers to start")
    time.sleep(1)  # crude readiness wait — the servers expose no startup handshake

    try:
        remote_index_coll = SimIndexCollection()
        for port in ports:
            remote_index_coll.add_shards(
                RemoteSimIndex("http://localhost:{}/RPC2".format(port)))
        remote_index_coll.set_query_scorer('tfidf')
        remote_index_coll.index_urls('http://www.stanford.edu/',
                                     'http://www.berkeley.edu',
                                     'http://www.ucla.edu',
                                     'http://www.mit.edu')
        pprint(remote_index_coll.query_by_string('stanford university'))
    finally:
        # Always reap the server processes — the original only terminated
        # them when indexing/querying succeeded.
        for process in processes:
            process.terminate()
if __name__ == '__main__':
    # Run each sample in turn, then signal completion.
    for demo in (sample_similarity,
                 sample_sim_index,
                 sample_sim_index_collection,
                 sample_remote_indexes):
        demo()
    pprint('done!')