aboutsummaryrefslogtreecommitdiffstats
path: root/asdctool/src/main/resources/scripts/python/duplicatesAndRemove.py
blob: a4bd35dd2b2beddaa076af0e00d6dcb332937e15 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import json
import sys, getopt
from collections import OrderedDict

dict = {} 
dupliacteUid = {} 
#debugFlag = True 
debugFlag = False 

def join_strings(lst):
    concat = ""
    for string in lst:
	if (string != None):
		if (type(string) == int):
			string = str(string)
        	concat += (string + " ")
    return concat

def debug(desc, *args):
	'print only if debug enabled'
	if (debugFlag == True): 
		print desc, join_strings(args)  

def log(desc, arg):
	'print log info'
	print desc, arg  

def getUid(vertex):
	uid = None
	nodeLabel=vertex.get('nodeLabel')
	debug(nodeLabel)
	if ( nodeLabel == 'user' ):
		uid = vertex['userId']
	elif ( nodeLabel == 'tag' ):
		uid = vertex['name']
	elif ( nodeLabel == None ):
		pass
	elif ( nodeLabel == 'lockNode' ):
		uid = vertex.get('uid')
	else: uid = vertex['uid']

	debug(nodeLabel, uid)

	return uid	   

def generateFile(inputFile, outputFile):
	
	with open(inputFile) as json_file:
		dupliacteUid = {}
		json_data = json.load(json_file)
		for x in json_data['vertices']:
			uid = getUid(x)

			existId = dict.get(uid)
			if (existId == None):
				dict[uid] = x.get('_id')
			else:
				dupliacteUid[uid] = existId 	    

		log("duplicate ids", dupliacteUid)

		json_data_vertices = json_data['vertices']
		log("number of vertices is", len(json_data_vertices))

		ids = {} 
		deleteIndexes = []
		 
		for i in xrange(len(json_data_vertices)):
		#print "****** ", i, " *************"
		#print json_data_vertices[i]
			id = json_data_vertices[i]["_id"]
			uid = getUid(json_data_vertices[i])
			isDuplicateId = dupliacteUid.get(uid)
			if (isDuplicateId != None):
				debug("uid to id pair", uid if uid != None else 'None', id)
				value = ids.get(uid)
				if (value == None):
					list = [id,]	
					ids[uid] = list 
				else:
					value.append(id)	
					deleteIndexes.append(id)	

		log("ids", ids)
		log("deleteIndexes", deleteIndexes)
		log("deleteIndexes size", len(deleteIndexes))

		filter_vertex = [ x for x in json_data_vertices if x.get('_id') not in deleteIndexes ]
		json_data['vertices'] = filter_vertex	

		log("number of vertexes after filter", len(filter_vertex))

		json_data_edges = json_data['edges']

		log("number of edges", len(json_data_edges))
		
		filter_edge = [ x for x in json_data_edges if x['_outV'] not in (deleteIndexes) and x['_inV'] not in (deleteIndexes) ]
                json_data['edges'] = filter_edge

		log("number of edges after filter", len(json_data['edges']))

		json_data = OrderedDict(sorted(json_data.items(), key=lambda t: t[0], reverse=True))	
	
		with open(outputFile, 'w') as outfile:
			#json.dump(json_data, outfile)
			json.dump(json_data, outfile)
		log("output file is", outputFile);

def main(argv):
        print 'Number of arguments:', len(sys.argv), 'arguments.'
        inputfile = None 
        outputfile = ''
        try:
                opts, args = getopt.getopt(argv,"h:i:o:",["ifile=","ofile="])
        except getopt.GetoptError:
                print sys.argv[0], '-i <inputfile>'
                sys.exit(2)
        for opt, arg in opts:
                if opt == '-h':
                        print sys.argv[0], '-i <inputfile>'
                        sys.exit(3)
                elif opt in ("-i", "--ifile"):
                        inputfile = arg

        if ( inputfile == None ):
                print sys.argv[0] ,'-i <inputfile>'
                sys.exit(3)

        print 'Input file is "', inputfile
        generateFile(inputfile, inputfile + '.noduplicates')
		

if __name__ == "__main__":
        main(sys.argv[1:])
    
#	print x['uid']