-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_space_construction.py
55 lines (39 loc) · 1.97 KB
/
vector_space_construction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from db_manager import get_apk_features, get_apks, get_datasets_apks
import numpy as np
from scipy import sparse as sp
from sklearn.model_selection import train_test_split
from utils import RANDOM_STATE
def get_vector_space_dict(dimensions: list) -> dict:
    """Map every distinct dimension to its column index in the vector space.

    The dimensions are sorted so the mapping does not depend on the order in
    which they are supplied. Duplicated dimensions are collapsed first, so the
    indices always form the contiguous range ``0 .. len(result) - 1``; without
    that, the largest index could exceed ``len(result) - 1`` and a matrix
    sized from ``len(result)`` would be indexed out of bounds.

    Args:
        dimensions: the values spanning the vector space (duplicates allowed).

    Returns:
        A dict ``{dimension: index}`` where ``index`` is the rank of the
        dimension among the sorted distinct dimensions.
    """
    # np.unique returns the distinct values already sorted, which replaces
    # the sort and guarantees one index per dictionary key.
    ordered_dimensions = np.unique(dimensions)
    return {dimension: index for index, dimension in enumerate(ordered_dimensions)}
def get_app_sparse_feature_vector(vector_space_dict: dict, apk_id: int) -> list:
    """Return the vector-space indices of the features of one APK.

    The features are obtained through ``get_apk_features(apk_id)``; any
    feature that is not a key of ``vector_space_dict`` is ignored.
    """
    indices = []
    for feature in get_apk_features(apk_id):
        # Only features that belong to the vector space have an index.
        if feature in vector_space_dict:
            indices.append(vector_space_dict[feature])
    return indices
def get_X_y(vector_space_dict: dict, apks: list):
    """Build the feature matrix and the label vector for a list of APKs.

    Args:
        vector_space_dict: mapping ``{feature: column index}`` defining the
            vector space; its length gives the number of columns.
        apks: ``[(apk_id, malignity), ...]`` with ``apk_id`` an int and
            ``malignity`` equal to 0 or 1. May be empty.

    Returns:
        A pair ``(X, y)``. ``X`` is a CSR ``int8`` matrix of shape
        ``(len(apks), len(vector_space_dict))`` holding a 1 in every column
        whose index is returned by ``get_app_sparse_feature_vector`` for the
        APK of that row. ``y`` is an ``int8`` array holding 1 for each APK
        whose malignity equals 1 and 0 otherwise.
    """
    nb_apks = len(apks)
    nb_dimensions = len(vector_space_dict)

    # LIL is used while filling because it supports row-wise assignment;
    # the result is converted to CSR once at the end.
    X = sp.lil_matrix((nb_apks, nb_dimensions), dtype=np.int8)
    y = np.zeros(nb_apks, dtype=np.int8)

    # Unpacking each pair directly (instead of transposing with zip(*apks))
    # keeps an empty `apks` list valid: the loop simply does not run.
    for row, (apk_id, apk_malignity) in enumerate(apks):
        features_indices = get_app_sparse_feature_vector(vector_space_dict, apk_id)
        # An APK without any known feature has nothing to set; skipping it
        # also avoids indexing with the float-typed empty array that
        # np.sort([]) produces.
        if features_indices:
            X[row, np.sort(features_indices)] = 1
        if apk_malignity == 1:
            y[row] = 1

    return X.tocsr(), y
def get_X_y_datasets(vector_space_dict: dict, datasets: list):
    """Vectorize the APKs returned by ``get_datasets_apks(datasets)``.

    Returns whatever ``get_X_y`` returns for those APKs in the vector space
    described by ``vector_space_dict``.
    """
    return get_X_y(vector_space_dict, get_datasets_apks(datasets))
def get_all_X_y(dimensions: list):
    """Get the vectorized version of all APKs in the DB.

    The vector space is built from ``dimensions`` with
    ``get_vector_space_dict`` and the APKs come from ``get_apks()``; the
    result is whatever ``get_X_y`` returns for them.
    """
    # Arguments are evaluated left to right: the vector space is built
    # before the APKs are fetched.
    return get_X_y(get_vector_space_dict(dimensions), get_apks())
def train_test_split_apks(apks: list, train_size: float = 3 / 4):
    """Split ``apks`` into a training part and a testing part.

    The split is delegated to scikit-learn's ``train_test_split`` and is
    seeded with ``RANDOM_STATE``.

    Returns:
        The pair ``(apks_train, apks_test)``.
    """
    train_part, test_part = train_test_split(
        apks,
        train_size=train_size,
        random_state=RANDOM_STATE,
    )
    return train_part, test_part