Source code for wbia_cnn.ingest_data

# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
from wbia_cnn import utils
from wbia_cnn import ingest_helpers
from wbia_cnn import ingest_wbia
from wbia_cnn.dataset import DataSet
from os.path import join, basename, splitext
import utool as ut

print, rrr, profile = ut.inject2(__name__)


NOCACHE_DATASET = ut.get_argflag(('--nocache-cnn', '--nocache-dataset'))


def testdata_dataset():
    dataset = get_wbia_patch_siam_dataset(max_examples=5)
    return dataset
def testdata_patchmatch():
    """
    >>> from wbia_cnn.ingest_data import *  # NOQA
    """
    dataset = get_wbia_patch_siam_dataset(max_examples=5)
    data_fpath = dataset.data_fpath
    labels_fpath = dataset.labels_fpath
    data_cv2, labels = utils.load(data_fpath, labels_fpath)
    data = utils.convert_cv2_images_to_theano_images(data_cv2)
    return data, labels
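
# Hedged sketch (not part of the original module): utils.convert_cv2_images_to_theano_images
# presumably reorders a stack of (N, H, W, C) uint8 cv2 images into the
# (N, C, H, W) float32 layout that Theano convolutions expect. Under that
# assumption, a minimal equivalent might look like this:
def _sketch_cv2_to_theano(data_cv2):
    import numpy as np

    data = np.asarray(data_cv2, dtype=np.float32) / 255.0  # scale to [0, 1]
    return data.transpose(0, 3, 1, 2)  # NHWC -> NCHW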
def testdata_patchmatch2():
    """
    >>> from wbia_cnn.ingest_data import *  # NOQA
    """
    dataset = get_wbia_patch_siam_dataset(max_examples=5)
    data_fpath = dataset.data_fpath
    labels_fpath = dataset.labels_fpath
    data, labels = utils.load(data_fpath, labels_fpath)
    return data, labels
def get_extern_training_dpath(alias_key):
    return DataSet.from_alias_key(alias_key).training_dpath
def view_training_directories():
    r"""
    CommandLine:
        python -m wbia_cnn.ingest_data --test-view_training_directories

    Example:
        >>> # UTILITY_SCRIPT
        >>> from wbia_cnn.ingest_data import *  # NOQA
        >>> result = view_training_directories()
        >>> print(result)
    """
    ut.vd(ingest_wbia.get_juction_dpath())
def merge_datasets(dataset_list):
    """
    Merges a list of dataset objects into a single combined dataset.
    """

    def consensus_check_factory():
        """
        Returns a temporary function used to check that all incoming values
        with the same key are consistent
        """
        from collections import defaultdict

        past_values = defaultdict(lambda: None)

        def consensus_check(value, key):
            assert (
                past_values[key] is None or past_values[key] == value
            ), 'key=%r with value=%r does not agree with past_value=%r' % (
                key,
                value,
                past_values[key],
            )
            past_values[key] = value
            return value

        return consensus_check

    total_num_labels = 0
    total_num_data = 0

    input_alias_list = [dataset.alias_key for dataset in dataset_list]

    alias_key = 'combo_' + ut.hashstr27(repr(input_alias_list), hashlen=8)
    training_dpath = ut.ensure_app_resource_dir('wbia_cnn', 'training', alias_key)
    data_fpath = ut.unixjoin(training_dpath, alias_key + '_data.hdf5')
    labels_fpath = ut.unixjoin(training_dpath, alias_key + '_labels.hdf5')

    try:
        # Try to short-circuit with cached loading
        merged_dataset = DataSet.from_alias_key(alias_key)
        return merged_dataset
    except (Exception, AssertionError) as ex:
        ut.printex(
            ex,
            'alias definitions have changed. alias_key=%r' % (alias_key,),
            iswarning=True,
        )

    # Build the dataset
    consensus_check = consensus_check_factory()

    for dataset in dataset_list:
        print(ut.get_file_nBytes_str(dataset.data_fpath))
        print(dataset.data_fpath_dict['full'])
        print(dataset.num_labels)
        print(dataset.data_per_label)
        total_num_labels += dataset.num_labels
        total_num_data += dataset.data_per_label * dataset.num_labels
        # check that all data_dims agree
        data_shape = consensus_check(dataset.data_shape, 'data_shape')
        data_per_label = consensus_check(dataset.data_per_label, 'data_per_label')

    # hack record this
    import numpy as np

    data_dtype = np.uint8
    label_dtype = np.int32

    data = np.empty((total_num_data,) + data_shape, dtype=data_dtype)
    labels = np.empty(total_num_labels, dtype=label_dtype)

    # def iterable_assignment():
    #     pass
    data_left = 0
    data_right = None
    labels_left = 0
    labels_right = None
    for dataset in ut.ProgressIter(dataset_list, lbl='combining datasets', freq=1):
        X_all, y_all = dataset.subset('full')
        labels_right = labels_left + y_all.shape[0]
        data_right = data_left + X_all.shape[0]
        data[data_left:data_right] = X_all
        labels[labels_left:labels_right] = y_all
        data_left = data_right
        labels_left = labels_right

    ut.save_data(data_fpath, data)
    ut.save_data(labels_fpath, labels)
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    merged_dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=data_per_label,
        output_dims=1,
        num_labels=num_labels,
    )
    return merged_dataset
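
# Illustrative usage sketch (not part of the original module): merge_datasets
# requires that data_shape and data_per_label agree across its inputs, which
# consensus_check enforces. Merging a dataset with itself is a trivial way to
# exercise the code path; the sketch assumes the input exposes alias_key and
# subset('full') as the loop above does.
def _example_merge_datasets():
    dataset = grab_mnist_siam_dataset()
    merged = merge_datasets([dataset, dataset])
    print(merged.num_labels)  # twice the label count of the input
    return merged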
def grab_dataset(ds_tag=None, datatype='siam-patch'):
    if datatype == 'siam-patch':
        return grab_siam_dataset(ds_tag=ds_tag)
    elif datatype == 'siam-part':
        return get_wbia_part_siam_dataset()
    elif datatype == 'category':
        return grab_mnist_category_dataset()
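
# Usage sketch (illustrative): grab_dataset is a thin dispatcher over the
# three dataset families defined in this module, e.g.:
#     dataset = grab_dataset(datatype='siam-patch')  # default: patch pairs
#     dataset = grab_dataset(datatype='siam-part')   # part-based pairs
#     dataset = grab_dataset(datatype='category')    # MNIST categories
# Note that an unrecognized datatype currently falls through and returns None.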
def grab_siam_dataset(ds_tag=None):
    r"""
    Will build the dataset using the command line if it doesn't exist

    CommandLine:
        python -m wbia_cnn.ingest_data --test-grab_siam_dataset --db mnist --show
        python -m wbia_cnn.ingest_data --test-grab_siam_dataset --db liberty --show
        python -m wbia_cnn.ingest_data --test-grab_siam_dataset --db PZ_MTEST --show

        python -m wbia_cnn.ingest_data --test-grab_siam_dataset --db PZ_MTEST --show --nohud --nometa
        python -m wbia_cnn.ingest_data --test-grab_siam_dataset --db liberty --show --nohud --nometa

    Example:
        >>> # ENABLE_DOCTEST
        >>> from wbia_cnn.ingest_data import *  # NOQA
        >>> ds_tag = None
        >>> dataset = grab_siam_dataset(ds_tag=ds_tag)
        >>> ut.quit_if_noshow()
        >>> from wbia_cnn import draw_results
        >>> dataset.interact(ibs=dataset.getprop('ibs', None), key='test', chunck_sizes=(8, 4))
        >>> ut.show_if_requested()
    """
    if ds_tag is not None:
        try:
            return DataSet.from_alias_key(ds_tag)
        except Exception as ex:
            ut.printex(
                ex, 'Could not resolve alias. Need to rebuild dataset', keys=['ds_tag']
            )
            raise

    dbname = ut.get_argval('--db')
    if dbname == 'liberty':
        pairs = 250000
        dataset = grab_liberty_siam_dataset(pairs)
    elif dbname == 'mnist':
        dataset = grab_mnist_siam_dataset()
    else:
        dataset = get_wbia_patch_siam_dataset()
    return dataset
def grab_mnist_category_dataset_float():
    r"""
    CommandLine:
        python -m wbia_cnn grab_mnist_category_dataset_float
        python -m wbia_cnn grab_mnist_category_dataset_float --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia_cnn.ingest_data import *  # NOQA
        >>> dataset = grab_mnist_category_dataset_float()
        >>> dataset.print_subset_info()
        >>> dataset.print_dir_tree()
        >>> ut.quit_if_noshow()
        >>> inter = dataset.interact()
        >>> ut.show_if_requested()
    """
    import numpy as np

    training_dpath = ut.ensure_app_resource_dir('wbia_cnn', 'training')
    dataset = DataSet(
        name='mnist_float32', training_dpath=training_dpath, data_shape=(28, 28, 1)
    )
    try:
        dataset.load()
    except IOError:
        data, labels, metadata = ingest_helpers.grab_mnist2()
        # Get indices of the test / train split
        splitset = np.array(metadata['splitset'])
        train_idxs = np.where(splitset == 'train')[0]
        test_idxs = np.where(splitset == 'test')[0]
        # Give dataset the full data
        dataset.save(data, labels, metadata, data_per_label=1)
        # And the split sets
        dataset.add_split('train', train_idxs)
        dataset.add_split('test', test_idxs)
        dataset.clear_cache()
    dataset.ensure_symlinked()
    return dataset
def grab_mnist_category_dataset():
    r"""
    CommandLine:
        python -m wbia_cnn grab_mnist_category_dataset
        python -m wbia_cnn grab_mnist_category_dataset_float
        python -m wbia_cnn grab_mnist_category_dataset --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia_cnn.ingest_data import *  # NOQA
        >>> dataset = grab_mnist_category_dataset()
        >>> dataset.print_subset_info()
        >>> dataset.print_dir_tree()
        >>> ut.quit_if_noshow()
        >>> inter = dataset.interact()
        >>> ut.show_if_requested()
    """
    import numpy as np

    training_dpath = ut.ensure_app_resource_dir('wbia_cnn', 'training')
    dataset = DataSet(
        name='mnist_uint8', training_dpath=training_dpath, data_shape=(28, 28, 1)
    )
    try:
        dataset.load()
    except IOError:
        data, labels, metadata = ingest_helpers.grab_mnist1()
        # Get indices of the test / train split
        train_idxs = np.arange(60000)
        test_idxs = np.arange(10000) + 60000
        # Give dataset the full data
        dataset.save(data, labels, metadata, data_per_label=1)
        # And the split sets
        dataset.add_split('train', train_idxs)
        dataset.add_split('test', test_idxs)
        dataset.clear_cache()
    dataset.ensure_symlinked()
    return dataset
def grab_mnist_siam_dataset():
    r"""
    CommandLine:
        python -m wbia_cnn.ingest_data --test-grab_mnist_siam_dataset --show

    Example:
        >>> # ENABLE_DOCTEST
        >>> from wbia_cnn.ingest_data import *  # NOQA
        >>> dataset = grab_mnist_siam_dataset()
        >>> ut.quit_if_noshow()
        >>> from wbia_cnn import draw_results
        >>> #ibsplugin.rrr()
        >>> flat_metadata = {}
        >>> data, labels = dataset.subset('full')
        >>> ut.quit_if_noshow()
        >>> dataset.interact()
        >>> ut.show_if_requested()
    """
    training_dpath = ut.ensure_app_resource_dir('wbia_cnn', 'training')
    dataset = DataSet(
        name='mnist_pairs',
        training_dpath=training_dpath,
        data_shape=(28, 28, 1),
    )
    try:
        dataset.load()
    except IOError:
        data_, labels_, metadata_ = ingest_helpers.grab_mnist2()
        data, labels = ingest_helpers.convert_category_to_siam_data(data_, labels_)
        dataset.save(data, labels, data_per_label=2)
    return dataset
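
# Hedged sketch (not the actual implementation): the general idea behind
# ingest_helpers.convert_category_to_siam_data is to stack genuine pairs (same
# digit) and impostor pairs (different digits) so consecutive data rows form
# one labeled pair (data_per_label=2). The 1/0 label convention and sampling
# scheme here are assumptions; the real helper may differ.
def _sketch_convert_category_to_siam(data, labels, num_pairs=1000, rng=None):
    import numpy as np

    rng = np.random.RandomState(0) if rng is None else rng
    pair_imgs = []
    pair_labels = []
    for _ in range(num_pairs):
        idx1 = rng.randint(len(labels))
        if rng.rand() < 0.5:
            candidates = np.where(labels == labels[idx1])[0]  # genuine pair
            pair_label = 1
        else:
            candidates = np.where(labels != labels[idx1])[0]  # impostor pair
            pair_label = 0
        idx2 = rng.choice(candidates)
        pair_imgs.extend([data[idx1], data[idx2]])
        pair_labels.append(pair_label)
    return np.array(pair_imgs), np.array(pair_labels)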
def grab_liberty_siam_dataset(pairs=250000):
    """
    References:
        http://www.cs.ubc.ca/~mbrown/patchdata/patchdata.html
        https://github.com/osdf/datasets/blob/master/patchdata/dataset.py

    Notes:
        "info.txt" contains the match information
        Each row of info.txt corresponds to a separate patch, with the patches
        ordered from left to right and top to bottom in each bitmap image.

        3 types of metadata files

        info.txt - contains patch ids that correspond with the order of patches
            in the bmp images
            In the format:
                pointid, unused

        interest.txt -
            interest points corresponding to patches with patchids
            has same number of rows as info.txt
            In the format:
                reference image id, x, y, orientation, scale (in log2 units)

        m50_<d>_<d>_0.txt -
            matches files
            patchID1  3DpointID1  unused1  patchID2  3DpointID2  unused2

    CommandLine:
        python -m wbia_cnn.ingest_data --test-grab_liberty_siam_dataset --show

    Example:
        >>> # ENABLE_DOCTEST
        >>> from wbia_cnn.ingest_data import *  # NOQA
        >>> pairs = 500
        >>> dataset = grab_liberty_siam_dataset(pairs)
        >>> ut.quit_if_noshow()
        >>> from wbia_cnn import draw_results
        >>> #ibsplugin.rrr()
        >>> flat_metadata = {}
        >>> data, labels = dataset.subset('full')
        >>> ut.quit_if_noshow()
        >>> warped_patch1_list = data[::2]
        >>> warped_patch2_list = data[1::2]
        >>> dataset.interact()
        >>> ut.show_if_requested()
    """
    datakw = {
        'detector': 'dog',
        'pairs': pairs,
    }

    assert datakw['detector'] in ['dog', 'harris']
    assert pairs in [500, 50000, 100000, 250000]

    liberty_urls = {
        'dog': 'http://www.cs.ubc.ca/~mbrown/patchdata/liberty.zip',
        'harris': 'http://www.cs.ubc.ca/~mbrown/patchdata/liberty_harris.zip',
    }
    url = liberty_urls[datakw['detector']]
    ds_path = ut.grab_zipped_url(url)

    ds_name = splitext(basename(ds_path))[0]
    alias_key = 'liberty;' + ut.dict_str(datakw, nl=False, explicit=True)
    cfgstr = ','.join([str(val) for key, val in ut.iteritems_sorted(datakw)])

    # TODO: allow a move of the base data prefix
    training_dpath = ut.ensure_app_resource_dir('wbia_cnn', 'training', ds_name)
    if ut.get_argflag('--vtd'):
        ut.vd(training_dpath)
    ut.ensuredir(training_dpath)

    data_fpath = join(training_dpath, 'liberty_data_' + cfgstr + '.pkl')
    labels_fpath = join(training_dpath, 'liberty_labels_' + cfgstr + '.pkl')

    if not ut.checkpath(data_fpath, verbose=True):
        data, labels = ingest_helpers.extract_liberty_style_patches(ds_path, pairs)
        ut.save_data(data_fpath, data)
        ut.save_data(labels_fpath, labels)

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=(64, 64, 1),
        data_per_label=2,
        output_dims=1,
        num_labels=num_labels,
    )
    return dataset
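
# Hedged sketch (illustrative only): reading the liberty "info.txt" metadata
# described in the Notes above, where each row holds "pointid unused" and
# patches sharing a 3D point id depict the same physical point. The real
# extraction lives in ingest_helpers.extract_liberty_style_patches.
def _sketch_read_liberty_point_ids(ds_path):
    point_ids = []
    with open(join(ds_path, 'info.txt')) as file_:
        for line in file_:
            point_ids.append(int(line.split()[0]))  # first column: 3D point id
    return point_ids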
def get_wbia_patch_siam_dataset(**kwargs):
    """
    CommandLine:
        python -m wbia_cnn.ingest_data --test-get_wbia_patch_siam_dataset --show
        python -m wbia_cnn.ingest_data --test-get_wbia_patch_siam_dataset --show --db PZ_Master1 --acfg_name default
        python -m wbia_cnn.ingest_data --test-get_wbia_patch_siam_dataset --show --db PZ_Master1 --acfg_name timectrl
        python -m wbia_cnn.ingest_data --test-get_wbia_patch_siam_dataset --show --db PZ_MTEST --acfg_name unctrl --dryrun

    Example:
        >>> # ENABLE_DOCTEST
        >>> from wbia_cnn.ingest_data import *  # NOQA
        >>> from wbia_cnn import draw_results
        >>> import wbia
        >>> kwargs = {}  # ut.argparse_dict({'max_examples': None, 'num_top': 3})
        >>> dataset = get_wbia_patch_siam_dataset(**kwargs)
        >>> ut.quit_if_noshow()
        >>> dataset.interact()
        >>> ut.show_if_requested()
    """
    datakw = ut.argparse_dict(
        {
            #'db': 'PZ_MTEST',
            'max_examples': None,
            #'num_top': 3,
            'num_top': None,
            'min_featweight': 0.8 if not ut.WIN32 else None,
            'controlled': True,
            'colorspace': 'gray',
            'acfg_name': None,
        },
        alias_dict={'acfg_name': ['acfg', 'a']},
        verbose=True,
    )

    datakw.update(kwargs)

    # ut.get_func_kwargs(ingest_wbia.get_aidpairs_and_matches)

    if datakw['acfg_name'] is not None:
        del datakw['controlled']
    if datakw['max_examples'] is None:
        del datakw['max_examples']
    if datakw['num_top'] is None:
        del datakw['num_top']

    with ut.Indenter('[LOAD IBEIS DB]'):
        import wbia

        dbname = ut.get_argval('--db', default='PZ_MTEST')
        ibs = wbia.opendb(dbname=dbname, defaultdb='PZ_MTEST')

    # Nets dir is the root dir for all training on this data
    training_dpath = ibs.get_neuralnet_dir()
    ut.ensuredir(training_dpath)
    print('\n\n[get_wbia_patch_siam_dataset] START')
    # log_dir = join(training_dpath, 'logs')
    # ut.start_logging(log_dir=log_dir)

    alias_key = ibs.get_dbname() + ';' + ut.dict_str(datakw, nl=False, explicit=True)
    try:
        if NOCACHE_DATASET:
            raise Exception('forced cache off')
        # Try to short-circuit with cached loading
        dataset = DataSet.from_alias_key(alias_key)
        dataset.setprop('ibs', lambda: wbia.opendb(db=dbname))
        return dataset
    except Exception as ex:
        ut.printex(
            ex,
            'alias definitions have changed. alias_key=%r' % (alias_key,),
            iswarning=True,
        )

    with ut.Indenter('[BuildDS]'):
        # Get training data pairs
        colorspace = datakw.pop('colorspace')
        patchmatch_tup = ingest_wbia.get_aidpairs_and_matches(ibs, **datakw)
        (
            aid1_list,
            aid2_list,
            kpts1_m_list,
            kpts2_m_list,
            fm_list,
            metadata_lists,
        ) = patchmatch_tup
        # Extract and cache the data
        # TODO: metadata
        if ut.get_argflag('--dryrun'):
            print('exiting due to dry run')
            import sys

            sys.exit(0)
        tup = ingest_wbia.cached_patchmetric_training_data_fpaths(
            ibs,
            aid1_list,
            aid2_list,
            kpts1_m_list,
            kpts2_m_list,
            fm_list,
            metadata_lists,
            colorspace=colorspace,
        )
        data_fpath, labels_fpath, metadata_fpath, training_dpath, data_shape = tup
        print('\n[get_wbia_patch_siam_dataset] FINISH\n\n')

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=metadata_fpath,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=2,
        output_dims=1,
        num_labels=num_labels,
    )
    dataset.setprop('ibs', ibs)
    return dataset
def get_wbia_part_siam_dataset(**kwargs):
    """
    PARTS based network data

    CommandLine:
        python -m wbia_cnn.ingest_data --test-get_wbia_part_siam_dataset --show
        python -m wbia_cnn.ingest_data --test-get_wbia_part_siam_dataset --show --db PZ_Master1 --acfg_name timectrl
        python -m wbia_cnn.ingest_data --test-get_wbia_part_siam_dataset --show --db PZ_MTEST --acfg_name unctrl --dryrun

    Example:
        >>> # ENABLE_DOCTEST
        >>> from wbia_cnn.ingest_data import *  # NOQA
        >>> from wbia_cnn import draw_results
        >>> import wbia
        >>> kwargs = {}  # ut.argparse_dict({'max_examples': None, 'num_top': 3})
        >>> dataset = get_wbia_part_siam_dataset(**kwargs)
        >>> ut.quit_if_noshow()
        >>> dataset.interact(ibs=dataset.getprop('ibs'))
        >>> ut.show_if_requested()
    """
    import wbia

    datakw = ut.argparse_dict(
        {
            'colorspace': 'gray',
            'acfg_name': 'ctrl',
            #'db': None,
            'db': 'PZ_MTEST',
        },
        alias_dict={'acfg_name': ['acfg']},
        verbose=True,
    )
    datakw.update(kwargs)
    print('\n\n[get_wbia_part_siam_dataset] START')

    alias_key = ut.dict_str(datakw, nl=False, explicit=True)

    dbname = datakw.pop('db')

    try:
        if NOCACHE_DATASET:
            raise Exception('forced cache off')
        # Try to short-circuit with cached loading
        dataset = DataSet.from_alias_key(alias_key)
        dataset.setprop('ibs', lambda: wbia.opendb(db=dbname))
        return dataset
    except Exception as ex:
        ut.printex(
            ex,
            'alias definitions have changed. alias_key=%r' % (alias_key,),
            iswarning=True,
        )

    with ut.Indenter('[LOAD IBEIS DB]'):
        ibs = wbia.opendb(db=dbname)

    # Nets dir is the root dir for all training on this data
    training_dpath = ibs.get_neuralnet_dir()
    ut.ensuredir(training_dpath)

    with ut.Indenter('[BuildDS]'):
        # Get training data pairs
        colorspace = datakw.pop('colorspace')
        (aid_pairs, label_list, flat_metadata) = ingest_wbia.get_aidpairs_partmatch(
            ibs, **datakw
        )
        # Extract and cache the data, labels, and metadata
        if ut.get_argflag('--dryrun'):
            print('exiting due to dry run')
            import sys

            sys.exit(0)
        tup = ingest_wbia.cached_part_match_training_data_fpaths(
            ibs, aid_pairs, label_list, flat_metadata, colorspace=colorspace
        )
        data_fpath, labels_fpath, metadata_fpath, training_dpath, data_shape = tup
        print('\n[get_wbia_part_siam_dataset] FINISH\n\n')

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=metadata_fpath,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=2,
        output_dims=1,
        num_labels=num_labels,
    )
    dataset.setprop('ibs', ibs)
    return dataset
def get_numpy_dataset(data_fpath, labels_fpath, training_dpath):
    """ """
    import numpy as np

    # hack for caching num_labels
    data = np.load(data_fpath)
    data_shape = data.shape[1:]
    labels = np.load(labels_fpath)
    num_labels = len(labels)

    alias_key = 'temp'
    ut.ensuredir(training_dpath)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=1,
        output_dims=1,
        num_labels=num_labels,
    )
    return dataset
def get_numpy_dataset2(name, data_fpath, labels_fpath, training_dpath, cache=True):
    """ """
    import numpy as np

    # hack for caching num_labels
    data = np.load(data_fpath)
    data_shape = data.shape[1:]
    labels = np.load(labels_fpath)
    num_labels = len(labels)
    metadata = None

    dataset = DataSet(
        name=name,
        training_dpath=training_dpath,
        data_shape=data_shape,
    )
    error = False
    try:
        dataset.load()
    except IOError:
        error = True

    if error or not cache:
        import random

        # Get indices of the valid / train split
        idx_list = list(range(num_labels))
        random.shuffle(idx_list)
        split_idx = int(num_labels * 0.80)
        train_idxs = np.array(idx_list[:split_idx])
        valid_idxs = np.array(idx_list[split_idx:])
        # Give dataset the full data
        dataset.save(data, labels, metadata, data_per_label=1)
        # And the split sets
        dataset.add_split('train', train_idxs)
        dataset.add_split('valid', valid_idxs)
        dataset.clear_cache()
        print('LOADING FROM DATASET RAW')
    dataset.ensure_symlinked()
    return dataset
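
# Usage sketch (illustrative; the 'demo' names and shapes are hypothetical):
# building a DataSet from raw .npy arrays on disk with get_numpy_dataset2.
def _example_numpy_dataset():
    import numpy as np

    training_dpath = ut.ensure_app_resource_dir('wbia_cnn', 'training', 'demo')
    data_fpath = join(training_dpath, 'demo_data.npy')
    labels_fpath = join(training_dpath, 'demo_labels.npy')
    np.save(data_fpath, np.zeros((10, 28, 28, 1), dtype=np.uint8))
    np.save(labels_fpath, np.zeros(10, dtype=np.int32))
    # cache=False forces the train/valid split to be rebuilt from scratch
    return get_numpy_dataset2('demo', data_fpath, labels_fpath, training_dpath, cache=False)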
if __name__ == '__main__':
    """
    CommandLine:
        python -m wbia_cnn.ingest_data
        python -m wbia_cnn.ingest_data --allexamples
        python -m wbia_cnn.ingest_data --allexamples --noface --nosrc
    """
    import multiprocessing

    multiprocessing.freeze_support()  # for win32
    import utool as ut  # NOQA

    ut.doctest_funcs()