# -*- coding: utf-8 -*- """ Created on Tue Nov 22 17:20:31 2016 @author: cgalli """ import numpy as np import time start_time = time.time() # Generate random arrays data1 = np.random.rand(1000 * 1000) data2 = np.random.rand(1000 * 1000) #consider 1d array first diff = np.abs(data1 - data2) # At this point we are done needing the arrays of data1 and data2 # all "work" will be done on the diff array # Here we get the index positions of the diff array sorted. # we don't care about sorting the array for data values, we only care # that we can find the 10 smallest and 10 largest values # by using positions 0:10 and -10:: (the last 10) # In IDL the SORT(diff) provides the index positions just as numpy argsort does. # http://www.harrisgeospatial.com/docs/sort.html r = np.argsort(diff) # in the returned index positions, # we now know that the first 10 elements represent the smallest values, # and the last 10 elements are the largest ones #min_vals = diff[r[0:10]] # we don't need this, but it might be useful to verify things later min_pos = r[0:10] #max_vals = diff[r[-10::]] # we don't need this, but it might be useful to verify things later max_pos = r[-10::] #the above min_pos and max_pos index positions represent positions based on a flat 1D data array of 1000 X 1000 # but the 1000 x 1000 index positions of a 2D array is meaningful for this exercise. # we can easily create index positions for the 10 minimum value locations. # In fact, we can use the modulus of 1000 to know the X coordinate, and # dividing by 1000 gives us the Y coordinate. #''' #For example, if I created an array of 1 through 9 as 1 dimensional (1D), it would look like this: #[0, 1, 2, 3, 4, 5, 6, 7, 8] #We can also say this could be a 2D array of 3 x 3 rows and columns like this: #[0, 1, 2] #[3, 4, 5] #[6, 7, 8] #The data is the same, and the positions are the same IF it was still 1D. Assuming that #the first position of 0 is coordinate [0,0] in the new 2D array, we can apply the above #modulus and division to get index positions. If we wanted to know where the location of #5 is located in the 2D array, we take this: #x position = 5 % 3, which equals 2 #y position = 5 / 3, which equals 1 (remember division of integers will truncate, which is what we wanted) #If the index positoin of 5 was in our min_pos from above, we can now say it has the coordinate system #of (x, y) = (2, 1). So array1D[2, 1] = 5. #This relationship allows us to find the distance between any two points in the 2D cartesian space by using #a very basic distance formula of d = sqrt((y2 - y1)**2 + (x2 - x1)**2). #''' #of the 10 smallest, find distance, but first get the positions so we have an x and y "point" xpos = (min_pos % 1000) ypos = (min_pos / 1000) #Up to now, we have only written 6 lines of code to represent all of our relationships # to find the distances of minimum values. #the next part is up to you on how to approach. Because we only have points to compare, # we might not care that looping over 10 points recursively ends up with a O(n**2) complexity. # in fact, trying to get only 10 data points into a form that is more elegant in complexity, # will probably result in adding more time because the "setup" or "approach" is going to # require more overhead than looping 10**2 times. So here's one approach in python. # First, I want a set of x,y pairs to compare. #''' #For example, if I had these two arrays from xpos and ypos: #xpos = [10,23,54,18,40] #ypos = [45,11,76,13,16] #I have x,y pairs like (10,45) and (23,11) and (54,76) and so on... #So comparing one point to another with a basic distance equation yields distance in meters #if we assumed index positions are meters. Remember we said the 1000 x 1000 was 1 square kilometer. #''' #reshape xpos and ypos to pairs of points d = np.array([xpos,ypos]).reshape(10,2) #now loop 10 iterations and loop the same amount in a nested loop # it only takes 9.9 micro seconds. That's 0.00099 seconds. It passes the "good enough test." :) #set a large minimum distance to start with min_dist = 99999 mpos = (-1,-1) t2 = time.time() for i in xrange(len(xpos)): point1 = d[i,:] #compare this point to all others for j in xrange(len(xpos)): if i == j: # no need to compare itself to itself! # so continue onto next inner loop iteration continue point2 = d[j,:] this_dist = np.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2) if this_dist < min_dist: min_dist = this_dist mpos = (point1, point2) print time.time() - start_time print 'The terrible O(n**2) approach took this much time:', time.time() - t2 print 'positions and distance of closest minimum values from 10 points', mpos, min_dist #then do the same to find the maximum distance for the maximum values. #Note: we could probably find a clever way to make this O(n**2) problem a little easier, but again # this requires overhead to setup the problem, and that setup is likely more expensive than # just looping 100 times. So I might conclude that this is an optimzed "good enough" version of code. # It passes the test of "Is this practical and good enough?" I think yes.