Skip to content

Commit

Permalink
Merge pull request #487 from vanithakattumuri/main
Browse files Browse the repository at this point in the history
#2 #2 updated the TemporalDatabase.py code with minor changes (i.e., occurrenceProbabilityAtSameTimestamp and occurrenceProbabilityToSkipSubsequentTimestamp)
  • Loading branch information
udayRage authored Aug 3, 2024
2 parents 94fd3a4 + 72f7293 commit c0cbd85
Showing 1 changed file with 97 additions and 91 deletions.
188 changes: 97 additions & 91 deletions PAMI/extras/syntheticDataGenerator/TemporalDatabase.py
Original file line number Diff line number Diff line change
@@ -1,118 +1,109 @@
# TemporalDatabase is a collection of timestamps along with the data at each particular time.
#
# **Importing this algorithm into a python program**
# --------------------------------------------------------
#
# from PAMI.extras.syntheticDataGenerator import TemporalDatabase as db
#
# temporalDB = db(numOfTransactions, avgTransactionLength, numItems, outFileName)
# temporalDB = db.TemporalDatabase(numOfTransactions, avgTransactionLength, numItems, outFileName, percentage, sep, occurrenceProbabilityAtSameTimestamp, occurrenceProbabilityToSkipSubsequentTimestamp)
#
# temporalDB.create(percentage)
# temporalDB.create()
#
#


__copyright__ = """
Copyright (C) 2021 Rage Uday Kiran
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""


import pandas as pd
import numpy as np
import sys


class TemporalDatabase:
"""
:Description: - creates a temporal database with required parameter (e.g.,numOfTransactions, avgLenOfTransactions, numItems and outputFile).
- output can be printed in two ways either in text file or dataframe depending on the input type.
:Attributes:
:param numOfTransactions: int
number of transactions
:param avgLenOfTransactions: int
average length of transactions
Creates a temporal database with transactions and timestamps.
:param numItems: int
number of items
This class generates a temporal database based on the given parameters and provides
options to output the database in either a text file or a DataFrame format.
:param outputFile: str
the name of the output file
**Importing this algorithm into a Python program**
:param percentage: int
percentage of coinToss for TID of temporalDatabase
:param sep: str
separator for database output file
from PAMI.extras.syntheticDataGenerator import TemporalDatabase as db
:param typeOfFile: str
specify database or dataframe to get corresponding output
temporalDB = db.TemporalDatabase(numOfTransactions, avgTransactionLength, numItems, outFileName, percentage, sep, occurrenceProbabilityAtSameTimestamp, occurrenceProbabilityToSkipSubsequentTimestamp)
:Methods:
getFileName():
returns filename
temporalDB.create()
createTemporalFile():
creates temporal database file or dataframe
getDatabaseAsDataFrame:
returns dataframe
performCoinFlip():
Perform a coin flip with the given probability
**Methods to execute code on terminal**
tuning():
Tune the arrayLength to match avgLenOfTransactions
Format:
createTemporalFile():
create Temporal database or dataframe depending on input
(.venv) $ python3 TemporalDatabase.py <numOfTransactions> <avgLenOfTransactions> <numItems> <outputFile> <percentage> <sep> <typeOfFile> <occurrenceProbabilityAtSameTimestamp> <occurrenceProbabilityToSkipSubsequentTimestamp>
**Methods to execute code on terminal**
---------------------------------------------
.. code-block:: console
Example Usage:
Format:
(.venv) $ python3 TemporalDatabase.py 50 10 100 temporal.txt 50 \t database 0.1 0.1
(.venv) $ python3 TemporalDatabase.py <numOfTransactions> <avgLenOfTransactions> <numItems> <outputFile>
:param numOfTransactions: int
Number of transactions to generate.
Example Usage:
:param avgLenOfTransactions: int
Average length of transactions.
(.venv) $ python3 TemporalDatabase.py 50.0 10.0 100 temporal.txt
:param numItems: int
Number of items in the database.
**Importing this algorithm into a python program**
--------------------------------------------------------
.. code-block:: python
:param outputFile: str
Name of the output file for the database.
from PAMI.extras.syntheticDataGenerator import TemporalDatabase as db
:param percentage: int
Percentage for the coin toss to decide if a transaction will be included in the output.
If the value is greater than 1, it is treated as a percentage (i.e., 50 for 50%).
temporalDB = db(numOfTransactions, avgTransactionLength, numItems, outFileName)
:param sep: str
Separator for the output file (default is tab).
temporalDB.create(percentage)
:param typeOfFile: str
Type of output file. Can be 'database' for a text file or 'dataframe' for a DataFrame output.
:param occurrenceProbabilityAtSameTimestamp: float
Probability that a new transaction will occur at the same timestamp as the previous one.
:param occurrenceProbabilityToSkipSubsequentTimestamp: float
Probability that the timestamp will be skipped for subsequent transactions.
"""
def __init__(self, numOfTransactions: int, avgLenOfTransactions: int,
numItems: int, outputFile: str, percentage: int=50,
sep: str='\t', typeOfFile: str="Database") -> None:


def __init__(self, numOfTransactions: int, avgLenOfTransactions: int,
numItems: int, outputFile: str, percentage: int = 50,
sep: str = '\t', typeOfFile: str = "Database",
occurrenceProbabilityAtSameTimestamp: float = 0.1,
occurrenceProbabilityToSkipSubsequentTimestamp: float = 0.1) -> None:
"""
Initialize the generateTemporalDatabase class with required parameters.
Initialize the TemporalDatabase with required parameters.
:param numOfTransactions: Number of transactions to generate.
:param avgLenOfTransactions: Average length of transactions.
:param numItems: Number of items in the database.
:param outputFile: Name of the output file for the database.
:param percentage: Percentage for the coin toss to include transactions.
:param sep: Separator for the output file.
:param typeOfFile: Type of output file ('database' or 'dataframe').
:param occurrenceProbabilityAtSameTimestamp: Probability for same timestamp.
:param occurrenceProbabilityToSkipSubsequentTimestamp: Probability to skip subsequent timestamp.
"""

self.numOfTransactions = numOfTransactions
Expand All @@ -125,90 +116,96 @@ def __init__(self, numOfTransactions: int, avgLenOfTransactions: int,
self.percentage = percentage
self.sep = sep
self.typeOfFile = typeOfFile.lower()
self.occurrenceProbabilityAtSameTimestamp = occurrenceProbabilityAtSameTimestamp
self.occurrenceProbabilityToSkipSubsequentTimestamp = occurrenceProbabilityToSkipSubsequentTimestamp

def getFileName(self) -> str:
"""
This function take the name of the outputfile.
:return: outputFile.
Returns the name of the output file.
:return: Output file name.
"""
return self.outputFile

def getDatabaseAsDataFrame(self) -> pd.DataFrame:
"""
This function return the database in dataframe format.
Returns the database as a DataFrame.
return: pd.dataframe
:return: pd.DataFrame containing the temporal database.
"""
return self.df

def performCoinFlip(self, probability: float) -> bool:
    """
    Perform a biased coin flip.

    :param probability: Probability of the coin landing heads (i.e., the
        event occurring). It is passed to ``np.random.choice`` as a weight,
        so it is expected to lie in [0, 1].
    :return: True if the coin lands heads, False otherwise.
    """
    outcome = np.random.choice([0, 1], p=[1 - probability, probability])
    # Comparing a NumPy integer yields numpy.bool_; convert so the method
    # really returns the builtin bool promised by its annotation.
    return bool(outcome == 1)


def tuning(self, array, sumRes) -> list:
"""
Tune the array so that the sum of the values is equal to sumRes
Tune the array to ensure that the sum of the values equals sumRes.
Parameters:
:param array: list of values randomly generated.
:param array: List of values to be tuned.
:type array: list
:param sumRes: target sum
:param sumRes: Target sum for the array values.
:type sumRes: int
Returns:
array: list - tuned array
:return: Tuned list of values.
"""
# first generate a random array of length n whose values average to m
values = np.random.randint(1, self.numItems, len(array))

while np.sum(values) != sumRes:
# get index of largest value
# if sum is too large, decrease the largest value
if np.sum(values) > sumRes:
maxIndex = np.argmax(values)
values[maxIndex] -= 1
# if sum is too small, increase the smallest value
else:
minIndex = np.argmin(values)
values[minIndex] += 1

# get location of all values greater than numItems

for i in range(len(array)):
array[i][1] = values[i]

return array

def create(self) -> None:
"""
create Temporal database or dataframe depending on type of file specified.
:return: None
Create the temporal database or DataFrame based on the specified type of file.
"""

db = []
lineSize = []
self.current_timestamp = 0 # Initialize current timestamp

for i in range(self.numOfTransactions):
db.append([i])
if self.performCoinFlip(self.occurrenceProbabilityAtSameTimestamp):
timestamp = self.current_timestamp
else:
if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp):
self.current_timestamp += 2
else:
self.current_timestamp += 1
timestamp = self.current_timestamp

db.append([timestamp])
if self.performCoinFlip(self.percentage):
lineSize.append([i,0])

# make it so that sum of lineSize[1] equal to numTransactions * avgLenOfTransactions
lineSize.append([i, 0])

sumRes = self.numOfTransactions * self.avgLenOfTransactions
self.tuning(lineSize, sumRes)

for i in range(len(lineSize)):
if lineSize[i][1] > self.numItems:
raise ValueError("Error: Either increase numItems or decrease avgLenOfTransactions or modify percentage")
raise ValueError(
"Error: Either increase numItems or decrease avgLenOfTransactions or modify percentage")
line = np.random.choice(range(1, self.numItems + 1), lineSize[i][1], replace=False)
db[lineSize[i][0]].extend(line)

if self.typeOfFile == "database":
with open(self.outputFile, "w") as outFile:
for line in db:
outFile.write(self.sep.join(map(str, line)) + '\n')
outFile.close()

if self.typeOfFile == "dataframe":
data = {
Expand All @@ -219,7 +216,16 @@ def create(self) -> None:

print("Temporal database created successfully")

if __name__ == '__main__':
    # Command-line entry point: exactly nine positional arguments are expected
    # after the script name; anything else prints the usage line and exits
    # with status 1.
    if len(sys.argv) != 10:
        print("Usage: python TemporalDatabase.py <numOfTransactions> <avgLenOfTransactions> <numItems> <outputFile> <percentage> <sep> <typeOfFile> <occurrenceProbabilityAtSameTimestamp> <occurrenceProbabilityToSkipSubsequentTimestamp>")
        sys.exit(1)

    # Command-line values arrive as strings, so numeric parameters are
    # converted before being handed to the generator.
    obj = TemporalDatabase(
        int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]), sys.argv[4],
        percentage=int(sys.argv[5]), sep=sys.argv[6], typeOfFile=sys.argv[7],
        occurrenceProbabilityAtSameTimestamp=float(sys.argv[8]),
        occurrenceProbabilityToSkipSubsequentTimestamp=float(sys.argv[9])
    )
    obj.create()

0 comments on commit c0cbd85

Please sign in to comment.