<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Jupyter setup</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li>
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2>Real-time data streaming</h2>
<ol>
<li><a href="portfolio-details-8.html" class="clickable-box"><i class="fas fa-arrow-left"></i> Go to Project page</a></li>
<li><a href="content-page.html" class="clickable-box">Go to content <i class="fas fa-arrow-right"></i></a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!------ right dropdown menue ------->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(5, 5, 5);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(12, 12, 12);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> SQL basics</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> BigQuery</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>JupyterLab and Apache Nifi Configuration</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<div class="swiper-slide">
<img src="assets/img/portfolio/jupyterlab-logo.png" class="img-fluid" alt="Image Description" style="width: 200px; height: 230px;">
</div>
<div class="swiper-slide">
<img src="assets/img/portfolio/Apache-nifi-logo.png" alt="Image Description" style="width: 350px; height: 200px;">
</div>
<div class="swiper-slide">
<img src="assets/img/portfolio/Apache_ZooKeeper_logo.png" alt="Image Description" style="width: 400px; height: 200px;">
</div>
</div>
<div class="swiper-pagination"></div>
</div>
</div>
<!-- setting up the Jupyterlab -->
<h2><a href="https://jupyter.org/" target="_blank"><strong>Introduction to JupyterLab</strong></a></h2>
<p>JupyterLab is a web-based user interface for working with Jupyter notebooks, code, data, and other documents.
It allows you to create, edit, run, and share your notebooks, as well as explore and visualize your data in various
ways. JupyterLab is designed to be flexible, integrated, and extensible, with a modular architecture that supports
extensions and custom components. </p>
<img src="assets/img/portfolio/jupyterlab-home.png" alt="Image Description" style="width: 700px; height: 400px;">
<h4><strong>Setting up JupyterLab on an EC2 machine</strong></h4>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Let's review first. After creating the EC2 instance, you need to do the following, in order (this description is for <strong>Amazon Linux</strong>;
for other operating systems, see the official AWS EC2 documentation):
<ol type="I">
<li>Update the packages on the EC2 machine:</li>
<pre><code>sudo yum update -y</code></pre>
<li>Install Docker:</li>
<pre><code>sudo yum install docker</code></pre>
<li>Transfer the docker-compose.yml file to the EC2 instance (for more details, see the <a href="Docker-Container.html" target="_blank">Docker configuration page</a>).</li>
<pre><code>scp -i [your-key.pem] docker-compose.yml ec2-user@[your-ec2-instance-ip]:/path/to/destination</code></pre>
<li>Add your user to the docker group so you can run Docker commands without sudo:</li>
<pre><code>sudo gpasswd -a $USER docker</code></pre>
<pre><code>newgrp docker</code></pre>
<li>Next, start the Docker service:</li>
<pre><code>sudo systemctl start docker</code></pre>
<li>Once JupyterLab is set up on the EC2 machine, you just need to run <code>docker-compose up -d</code> to start the JupyterLab server
(the <code>-d</code> flag runs the containers in detached mode, so they keep running in the background).</li>
<pre><code>docker-compose up -d</code></pre>
<li>Install pip for Python:</li>
<pre><code>sudo yum install python-pip</code></pre>
</ol>
</li>
<li>Once the Docker Compose services are up and running, you can access JupyterLab in your web browser. By default, JupyterLab listens on port 8888; in this setup it is mapped to host port 4888 in the docker-compose.yml.</li>
<li>Open a web browser and navigate to:</li>
<pre><code>http://[your-ec2-instance-ip]:4888/lab</code></pre>
<p>When you access JupyterLab for the first time, you may be asked to provide an authentication token or password. You can find this token in the terminal
output from <code>docker-compose up -d</code> (or in <code>docker-compose logs</code>). Look for a line that contains "token=" followed by a long string of characters. Copy and paste this
token when prompted in your web browser.</p>
<li> Once you've authenticated, you can start using JupyterLab to run notebooks and perform data analysis or development tasks.</li>
<li>In our case, we generate a fake dataset using the Python library 'Faker'. To install this library on the EC2 machine, use:</li>
<pre>
<code>
# Install Faker library
pip install faker
</code>
</pre>
and generate a list of customers (customer_id, first_name, last_name, email, street, city, state, country) using the following code:
<pre class="language-python"><code>
from faker import Faker
import csv
import os
from datetime import datetime

RECORD_COUNT = 10000
fake = Faker()
current_time = datetime.now().strftime("%Y%m%d%H%M%S")
print(current_time)

def create_csv_file():
    # Ensure the output directory exists before writing
    os.makedirs('FakeDataset', exist_ok=True)
    with open(f'FakeDataset/customer_{current_time}.csv', 'w', newline='') as csvfile:
        fieldnames = ["customer_id", "first_name", "last_name", "email",
                      "street", "city", "state", "country"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(RECORD_COUNT):
            writer.writerow(
                {
                    "customer_id": i,  # or fake.random_int(min=1, max=10000)
                    'first_name': fake.first_name(),
                    'last_name': fake.last_name(),
                    'email': fake.email(),
                    'street': fake.street_address(),
                    'city': fake.city(),
                    'state': fake.state(),
                    'country': fake.country()
                }
            )

if __name__ == '__main__':
    create_csv_file()
</code></pre>
<li>The generated CSV files will be saved in a directory named "FakeDataset" inside the directory where your JupyterLab notebook is running.</li>
</ul>
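<p>As a quick sanity check, you can read the newest generated file back and confirm the record count. This is a minimal sketch that assumes the "FakeDataset" directory and the <code>customer_*.csv</code> naming used by the script above:</p>
<pre class="language-python"><code>
import csv
import glob
import os

def newest_csv(directory="FakeDataset"):
    # Most recently modified customer_*.csv in the directory, or None
    files = glob.glob(os.path.join(directory, "customer_*.csv"))
    return max(files, key=os.path.getmtime) if files else None

def count_records(csv_path):
    # Data rows only; DictReader consumes the header line first
    with open(csv_path, newline="") as f:
        return sum(1 for _ in csv.DictReader(f))

if __name__ == "__main__":
    latest = newest_csv()
    if latest:
        print(latest, count_records(latest))
</code></pre>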
<!-- setting up the Apache Nifi -->
<h2><a href="https://nifi.apache.org/docs.html" target="_blank"><strong>Introduction to Apache NiFi</strong></a></h2>
<ul>
<p>Apache NiFi is an open-source data integration tool that provides an intuitive and powerful way to automate the flow of data between systems. It was
developed by the Apache Software Foundation and is designed to solve the challenges of data routing, transformation, and system connectivity in a
scalable and extensible manner. NiFi's primary goal is to enable the automation of data flow, making it an essential tool for data ingestion,
transformation, and routing in various scenarios.</p>
<p>Apache NiFi is a real-time data ingestion platform that can transfer and manage data between different source and destination systems.
For configuration details, you can look at the <a href="https://www.tutorialspoint.com/apache_nifi/index.htm" target="_blank">Apache NiFi Tutorials</a>.
</p>
</ul>
<ol style="margin-left: 30px;">
<li><b>Apache NiFi- General Features:</b> </li>
<ul>
<li>Apache NiFi provides a web-based user interface, which gives a seamless experience across design, control, feedback, and monitoring.</li>
<li>It is highly configurable, offering guaranteed delivery, low latency, high throughput, dynamic prioritization, back pressure,
and the ability to modify flows at runtime.</li>
<li>It also provides a data provenance module to track and monitor data from the start to the end of the flow.</li>
<li>Developers can create their own custom processors and reporting tasks according to their needs.</li>
<li>NiFi also supports secure protocols such as SSL, HTTPS, SSH, and other encryption mechanisms.</li>
<li>It also supports user and role management and can be configured with LDAP for authorization.</li>
</ul>
<li><b>Apache NiFi -Key Concepts:</b></li>
<ul>
<li><strong>Process Group</strong> − A group of NiFi flows, which helps a user manage and keep flows in a hierarchical manner.</li>
<li><strong>Flow</strong> − Created by connecting different processors to transfer and, if required, modify data from one or more source systems to
a destination system.</li>
<li><strong>Processor</strong> − A Java module responsible for either fetching data from a source system or storing it in a destination
system. Other processors are used to add attributes or change the content of a flowfile.</li>
<li><strong>Flowfile</strong> − The basic unit of data in NiFi, representing a single object of data picked from a source system.
A NiFi processor makes changes to a flowfile as it moves from the source processor to the destination. Different events such as CREATE,
CLONE, RECEIVE, etc. are performed on a flowfile by different processors in a flow.</li>
<li><strong>Event</strong> − Events represent the changes to a flowfile while traversing through a NiFi flow. These events are tracked in data provenance.</li>
<li><strong>Data provenance</strong> − A repository with its own UI, which enables users to check the information about a flowfile and helps in
troubleshooting any issues that arise during the processing of a flowfile.</li>
</ul>
<li><strong>Apache NiFi Advantages:</strong></li>
<ul>
<li>Apache NiFi enables data fetching from remote machines by using SFTP and guarantees data lineage.</li>
<li>Apache NiFi supports clustering, so it can work on multiple nodes with the same flow processing different data, which increases the performance of data processing.</li>
<li>It also provides security policies on the user level, process group level, and other modules too.</li>
<li>Its UI can also run on HTTPS, which makes the interaction of users with NiFi secure.</li>
<li>NiFi supports around 188 processors, and a user can also create custom plugins to support a wide variety of data systems.</li>
</ul>
<li><strong>Apache NiFi Disadvantages:</strong></li>
<ul>
<li>When a node gets disconnected from the NiFi cluster while a user is making changes to the flow, the flow.xml becomes invalid. The node cannot
rejoin the cluster unless an admin manually copies flow.xml from a connected node.</li>
<li>Apache NiFi has a state-persistence issue during primary-node switchover, which can sometimes leave processors unable to fetch data from source systems.</li>
</ul>
</ol>
<ul>
<p><strong>Apache NiFi - Processors Categorization</strong></p>
<p>The various processors available in Apache NiFi fall into the following categories:</p>
</ul>
<table>
<tr>
<th>Processor Category</th>
<th>Processor category Explanation</th>
<th>Processor Names</th>
</tr>
<tr>
<td>Data Ingestion Processors</td>
<td>The processors under Data Ingestion category are used to ingest data into the NiFi data flow. These are mainly the starting point of any data flow in Apache NiFi. Some of the processors that belong to these categories are GetFile, GetHTTP, GetFTP, GetKAFKA, etc.</td>
<td>GetFile, GetHTTP, GetFTP, GetKAFKA, etc.</td>
</tr>
<tr>
<td>Routing and Mediation Processors</td>
<td>Routing and Mediation processors are used to route the flowfiles to different processors or data flows according to the information in attributes or content of those flowfiles. These processors are also responsible to control the NiFi data flows. Some of the processors that belong to this category are RouteOnAttribute, RouteOnContent, ControlRate, RouteText, etc.</td>
<td>RouteOnAttribute, RouteOnContent, ControlRate, RouteText, etc.</td>
</tr>
<tr>
<td>Database Access Processors</td>
<td>The processors of this Database Access category are capable of selecting or inserting data or executing and preparing other SQL statements from the database. These processors mainly use the data connection pool controller setting of Apache NiFi. Some of the processors that belong to this category are ExecuteSQL, PutSQL, PutDatabaseRecord, ListDatabaseTables, etc.</td>
<td>ExecuteSQL, PutSQL, PutDatabaseRecord, ListDatabaseTables, etc.</td>
</tr>
<tr>
<td>Attribute Extraction Processors</td>
<td>Attribute Extraction Processors are responsible for extracting, analyzing, and changing flowfile attributes processing in the NiFi data flow. Some of the processors that belong to this category are UpdateAttribute, EvaluateJSONPath, ExtractText, AttributesToJSON, etc.</td>
<td>UpdateAttribute, EvaluateJSONPath, ExtractText, AttributesToJSON, etc.</td>
</tr>
<tr>
<td>System Interaction Processors</td>
<td>System Interaction processors are used to run processes or commands in any operating system. These processors also run scripts in many languages to interact with a variety of systems. Some of the processors that belong to this category are ExecuteScript, ExecuteProcess, ExecuteGroovyScript, ExecuteStreamCommand, etc.</td>
<td>ExecuteScript, ExecuteProcess, ExecuteGroovyScript, ExecuteStreamCommand, etc.</td>
</tr>
<tr>
<td>Data Transformation Processors</td>
<td>Processors that belong to Data Transformation are capable of altering the content of the flowfiles. These can be used to fully replace the data of a flowfile, normally used when a user has to send a flowfile as an HTTP body to InvokeHTTP processor. Some of the processors that belong to this category are ReplaceText, JoltTransformJSON, etc.</td>
<td>ReplaceText, JoltTransformJSON, etc.</td>
</tr>
<tr>
<td>Sending Data Processors</td>
<td>Sending Data Processors are generally the end processors in a data flow. These processors are responsible for storing or sending data to the destination server. After successfully storing or sending the data, these processors DROP the flowfile with the success relationship. Some of the processors that belong to this category are PutEmail, PutKafka, PutSFTP, PutFile, PutFTP, etc.</td>
<td>PutEmail, PutKafka, PutSFTP, PutFile, PutFTP, etc.</td>
</tr>
<tr>
<td>Splitting and Aggregation Processors</td>
<td>These processors are used to split and merge the content present in a flowfile. Some of the processors that belong to this category are SplitText, SplitJson, SplitXml, MergeContent, SplitContent, etc.</td>
<td>SplitText, SplitJson, SplitXml, MergeContent, SplitContent, etc.</td>
</tr>
<tr>
<td>HTTP Processors</td>
<td>These processors deal with HTTP and HTTPS calls. Some of the processors that belong to this category are InvokeHTTP, PostHTTP, ListenHTTP, etc.</td>
<td>InvokeHTTP, PostHTTP, ListenHTTP, etc.</td>
</tr>
<tr>
<td>AWS Processors</td>
<td>AWS processors are responsible for interacting with the Amazon Web Services system. Some of the processors that belong to this category are GetSQS, PutSNS, PutS3Object, FetchS3Object, etc.</td>
<td>GetSQS, PutSNS, PutS3Object, FetchS3Object, etc.</td>
</tr>
</table>
<strong>The NiFi UI:</strong>
<img src="assets/img/portfolio/apache-nifi-home-2.png" alt="Image Description" style="width: 1000px; height: 520px;">
<h4><strong>Setting up Apache NiFi on an EC2 machine</strong></h4>
<i>Using Apache NiFi to Transfer CSV Files to AWS S3</i>
<ul style="list-style-type: disc; margin-left: 30px;">
<li><b>Step 1: Prepare Your EC2 Instance</b></li>
<p>Ensure that you have Apache NiFi installed and running on your EC2 instance. You can download and set up Apache NiFi by following the official
documentation: <a href="https://nifi.apache.org/docs/nifi-docs/html/getting-started.html">NiFi Getting Started</a></p>
<li><b>Step 2: Create a NiFi Flow:</b></li>
<p>1. Start the NiFi UI by navigating to <a href="http://your-ec2-instance-ip:8080/nifi">http://your-ec2-instance-ip:8080/nifi</a>.</p>
<p>2. In the NiFi UI, you need to create a data flow that fetches and uploads the CSV files to your S3 bucket.</p>
<p></p>
<p>3. Create a processor group (a container for processors) in NiFi to organize your flow. Drag the 'processor' to the canvas.</p>
<img src="assets/img/portfolio/apache-nifi-processor.png" alt="Image Description">
<p>You can create as many processors as you need:</p>
<img src="assets/img/portfolio/apache-nifi-processes-creation.png" alt="Image Description">
<li><b>Step 3: Configure GetFile Processor</b></li>
<p>Select the "GetFile" processor from the list of available processors and drag it onto your canvas.
This processor will retrieve the CSV files from the local directory. Configure it as follows:</p>
<img src="assets/img/portfolio/getfile.jpg" alt="Image Description">
<ul>
<li>Specify the input directory where your JupyterLab-generated CSV files are located (e.g., "FakeDataset").</li>
<li>Set a polling schedule to check for new files at regular intervals.</li>
<li>You only need to set a few of the properties from the table below in the processor configuration.</li>
<li>(<a href="https://nifi.apache.org/docs/nifi-docs/components/org.apache.nifi/nifi-standard-nar/1.23.2/org.apache.nifi.processors.standard.GetFile/index.html" target="_blank">More information</a>)</li>
</ul>
<table>
<tr>
<th>Display Name</th>
<th>API Name</th>
<th>Default Value</th>
<th>Allowable Values</th>
<th>Description</th>
</tr>
<tr>
<td>Input Directory</td>
<td>Input Directory</td>
<td></td>
<td></td>
<td>The input directory from which to pull files</td>
</tr>
<tr>
<td>File Filter</td>
<td>File Filter</td>
<td>[^\.].*</td>
<td></td>
<td>Only files whose names match the given regular expression will be picked up</td>
</tr>
<tr>
<td>Path Filter</td>
<td>Path Filter</td>
<td></td>
<td></td>
<td>When Recurse Subdirectories is true, then only subdirectories whose path matches the given regular expression will be scanned</td>
</tr>
<tr>
<td>Batch Size</td>
<td>Batch Size</td>
<td>10</td>
<td></td>
<td>The maximum number of files to pull in each iteration</td>
</tr>
<tr>
<td>Keep Source File</td>
<td>Keep Source File</td>
<td>false</td>
<td>True, False</td>
<td>If true, the file is not deleted after it has been copied to the Content Repository; this causes the file to be picked up continually
and is useful for testing purposes. If not keeping the original, NiFi will need write permissions on the directory it is pulling from;
otherwise it will ignore the file.</td>
</tr>
<tr>
<td>Recurse Subdirectories</td>
<td>Recurse Subdirectories</td>
<td>true</td>
<td>True, False</td>
<td>Indicates whether or not to pull files from subdirectories</td>
</tr>
<tr>
<td>Polling Interval</td>
<td>Polling Interval</td>
<td>0 sec</td>
<td></td>
<td>Indicates how long to wait before performing a directory listing</td>
</tr>
<tr>
<td>Ignore Hidden Files</td>
<td>Ignore Hidden Files</td>
<td>true</td>
<td>True, False</td>
<td>Indicates whether or not hidden files should be ignored</td>
</tr>
<tr>
<td>Minimum File Age</td>
<td>Minimum File Age</td>
<td>0 sec</td>
<td></td>
<td>The minimum age that a file must be in order to be pulled; any file younger than this amount of time (according to last modification date) will be ignored</td>
</tr>
<tr>
<td>Maximum File Age</td>
<td>Maximum File Age</td>
<td></td>
<td></td>
<td>The maximum age that a file must be in order to be pulled; any file older than this amount of time (according to last modification date) will be ignored</td>
</tr>
<tr>
<td>Minimum File Size</td>
<td>Minimum File Size</td>
<td>0 B</td>
<td></td>
<td>The minimum size that a file must be in order to be pulled</td>
</tr>
<tr>
<td>Maximum File Size</td>
<td>Maximum File Size</td>
<td></td>
<td></td>
<td>The maximum size that a file can be in order to be pulled</td>
</tr>
</table>
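<p>The selection rules above can be sketched in Python. This is a simplified illustration of how GetFile applies its properties, not NiFi's actual implementation; the parameter names mirror the table:</p>
<pre class="language-python"><code>
import os
import re
import time

def select_files(directory, file_filter=r"[^\.].*", batch_size=10,
                 min_age_sec=0, ignore_hidden=True):
    # Pick up to batch_size files whose names match file_filter
    # and that are at least min_age_sec old
    pattern = re.compile(file_filter)
    now = time.time()
    picked = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        if ignore_hidden and name.startswith("."):
            continue  # Ignore Hidden Files
        if not pattern.fullmatch(name):
            continue  # File Filter
        if now - os.path.getmtime(path) < min_age_sec:
            continue  # younger than Minimum File Age
        picked.append(path)
        if len(picked) >= batch_size:
            break  # Batch Size reached
    return picked
</code></pre>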
<li><b>Step 4: Configure PutS3Object Processor</b></li>
<p>Add a "PutS3Object" processor to your flow. This processor will upload the fetched CSV files to your S3 bucket. Configure it as follows:</p>
<p>In the 'Put S3 Object' section of your pipeline configuration, you will need to configure an AWS credentials provider.</p>
<img src="assets/img/portfolio/PutS3Object.png" alt="Image Description" style="width: 450px; height: 200px;">
<ul>
<li>Set the AWS credentials, including your access key and secret key.</li>
<li>Specify the S3 bucket name where you want to upload the files.</li>
<li>Configure the destination directory within the S3 bucket.</li>
<li>Map the incoming flow file attributes to the S3 object properties.</li>
<li>(<a href="https://nifi.apache.org/docs/nifi-docs/components/org.apache.nifi/nifi-aws-nar/1.23.2/org.apache.nifi.processors.aws.s3.PutS3Object/index.html" target="_blank">More information</a>)</li>
</ul>
<table>
<tr>
<th>Display Name</th>
<th>API Name</th>
<th>Default Value</th>
<th>Allowable Values</th>
<th>Description</th>
</tr>
<tr>
<td>Object Key</td>
<td>Object Key</td>
<td>${filename}</td>
<td></td>
<td>The S3 Object Key to use. This is analogous to a filename for traditional file systems.</td>
</tr>
<tr>
<td>Bucket</td>
<td>Bucket</td>
<td></td>
<td></td>
<td>The S3 Bucket to interact with.</td>
</tr>
<tr>
<td>Content Type</td>
<td>Content Type</td>
<td></td>
<td></td>
<td>Sets the Content-Type HTTP header indicating the type of content stored in the associated object. The value of this header is a standard MIME type. AWS S3 Java client will attempt to determine the correct content type if one hasn't been set yet. Users are responsible for ensuring a suitable content type is set when uploading streams. If no content type is provided and cannot be determined by the filename, the default content type "application/octet-stream" will be used.
Supports Expression Language: true (will be evaluated using flow file attributes and variable registry)</td>
</tr>
<tr>
<td>Content Disposition</td>
<td>Content Disposition</td>
<td></td>
<td>inline, attachment</td>
<td>Sets the Content-Disposition HTTP header indicating if the content is intended to be displayed inline or should be downloaded. Possible values are 'inline' or 'attachment'. If this property is not specified, object's content-disposition will be set to filename. When 'attachment' is selected, '; filename=' plus object key are automatically appended to form the final value 'attachment; filename="filename.jpg"'</td>
</tr>
<tr>
<td>Cache Control</td>
<td>Cache Control</td>
<td></td>
<td></td>
<td>Sets the Cache-Control HTTP header indicating the caching directives of the associated object. Multiple directives are comma-separated.
Supports Expression Language: true (will be evaluated using flow file attributes and variable registry)</td>
</tr>
<tr>
<td>Access Key ID</td>
<td>Access Key</td>
<td></td>
<td></td>
<td>No Description Provided.
Sensitive Property: true
Supports Expression Language: true (will be evaluated using variable registry only)</td>
</tr>
<tr>
<td>Secret Access Key</td>
<td>Secret Key</td>
<td></td>
<td></td>
<td>No Description Provided.
Sensitive Property: true
Supports Expression Language: true (will be evaluated using variable registry only)</td>
</tr>
<tr>
<td>Credentials File</td>
<td>Credentials File</td>
<td></td>
<td></td>
<td>Path to a file containing AWS access key and secret key in properties file format.
This property requires exactly one file to be provided.</td>
</tr>
<tr>
<td>AWS Credentials Provider Service</td>
<td>AWS Credentials Provider service</td>
<td></td>
<td>Controller Service API: AWSCredentialsProviderService
Implementation: AWSCredentialsProviderControllerService</td>
<td>The Controller Service that is used to obtain AWS credentials provider</td>
</tr>
</table>
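<p>What PutS3Object does for each flowfile can be sketched with a small helper. This is a hedged illustration, not NiFi's code: the client is injected, so in practice you would pass <code>boto3.client("s3")</code> together with your real bucket name, and the object key defaults to the file name, since an S3 key is analogous to a filename:</p>
<pre class="language-python"><code>
import mimetypes
import os

def put_s3_object(s3_client, bucket, file_path, key=None):
    # Upload one file; s3_client is e.g. boto3.client("s3")
    key = key or os.path.basename(file_path)  # object key defaults to the filename
    content_type = (mimetypes.guess_type(file_path)[0]
                    or "application/octet-stream")  # same fallback as the table describes
    with open(file_path, "rb") as body:
        s3_client.put_object(Bucket=bucket, Key=key, Body=body,
                             ContentType=content_type)
    return key
</code></pre>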
<li><b>Step 5: Connect Processors:</b></li>
<p>Connect the "GetFile" processor to the "PutS3Object" processor using relationships. Ensure that the data flow follows the intended sequence.</p>
<li><b>Step 6: Start the Flow</b></li>
<p>Start the processor group to activate the flow. NiFi will begin monitoring the directory for new files and uploading them to the S3 bucket as they arrive.</p>
<li><b>Step 7: Monitor and Troubleshoot</b></li>
<p>You can monitor the status of the processors and view logs in the NiFi UI to ensure that the data flow is working correctly. Additionally, inspect
the S3 bucket to confirm that the CSV files are being uploaded as expected.</p>
</ul>
<p>Your Apache NiFi data flow is now set up to ingest CSV files generated in JupyterLab and store them in your AWS S3 bucket as they are created.
Make sure to configure and test it properly to suit your specific requirements.</p>
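<p>To confirm the end-to-end transfer, you can also list the bucket contents with the AWS SDK for Python. This is a sketch with an injected client; in practice you would pass <code>boto3.client("s3")</code> and your real bucket name:</p>
<pre class="language-python"><code>
def list_uploaded_csvs(s3_client, bucket, prefix=""):
    # Keys of CSV objects under the given prefix
    response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
    return [obj["Key"] for obj in response.get("Contents", [])
            if obj["Key"].endswith(".csv")]
</code></pre>
<p>Note that <code>list_objects_v2</code> returns at most 1,000 keys per call; for larger buckets, use a paginator.</p>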
</div>
<a href="portfolio-details-8.html" class="clickable-box">Go back</a>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
<script>
document.addEventListener("DOMContentLoaded", function () {
if (window.Prism) { Prism.highlightAll(); }
});
</script>
</body>
</html>