diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 15df70bb..66232928 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -6,8 +6,8 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch +from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions from cuda.core.experimental._system import System diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 6e36a2a5..54b14395 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -10,6 +10,7 @@ from typing import List, Optional from cuda import cuda +from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import check_or_create_options, handle_return @@ -92,10 +93,10 @@ class LinkerOptions: Attributes ---------- - arch : str - Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or - ``compute_`` (for generating PTX). - This is a required option. + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. max_register_count : int, optional Maximum register count. Maps to: ``-maxrregcount=``. @@ -173,7 +174,7 @@ class LinkerOptions: Default: False. """ - arch: str + arch: Optional[str] = None max_register_count: Optional[int] = None time: Optional[bool] = None verbose: Optional[bool] = None @@ -205,6 +206,8 @@ def __post_init__(self): def _init_nvjitlink(self): if self.arch is not None: self.formatted_options.append(f"-arch={self.arch}") + else: + self.formatted_options.append("-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability)) if self.max_register_count is not None: self.formatted_options.append(f"-maxrregcount={self.max_register_count}") if self.time is not None: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 0dac79eb..789ef539 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -3,10 +3,365 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import weakref +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union from cuda import nvrtc +from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._utils import ( + _handle_boolean_option, + check_or_create_options, + handle_return, + is_list_or_tuple, + is_nested_list_or_tuple, +) + + +@dataclass +class ProgramOptions: + """Customizable options for configuring `Program`. + + Attributes + ---------- + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. + relocatable_device_code : bool, optional + Enable (disable) the generation of relocatable device code. 
+        Default: False
+        Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``)
+    extensible_whole_program : bool, optional
+        Do extensible whole program compilation of device code.
+        Default: False
+        Maps to: ``--extensible-whole-program`` (``-ewp``)
+    debug : bool, optional
+        Generate debug information. If --dopt is not specified, then turns off all optimizations.
+        Default: False
+        Maps to: ``--device-debug`` (``-G``)
+    lineinfo : bool, optional
+        Generate line-number information.
+        Default: False
+        Maps to: ``--generate-line-info`` (``-lineinfo``)
+    device_code_optimize : bool, optional
+        Enable device code optimization. When specified along with '-G', enables limited debug information generation
+        for optimized device code.
+        Default: None
+        Maps to: ``--dopt on`` (``-dopt``)
+    ptxas_options : Union[str, List[str]], optional
+        Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
+        For example ["-v", "-O2"].
+        Default: None
+        Maps to: ``--ptxas-options <options>`` (``-Xptxas``)
+    max_register_count : int, optional
+        Specify the maximum amount of registers that GPU functions can use.
+        Default: None
+        Maps to: ``--maxrregcount=<N>`` (``-maxrregcount``)
+    ftz : bool, optional
+        When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
+        values.
+        Default: False
+        Maps to: ``--ftz={true|false}`` (``-ftz``)
+    prec_sqrt : bool, optional
+        For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
+        Default: True
+        Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``)
+    prec_div : bool, optional
+        For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
+        approximation.
+        Default: True
+        Maps to: ``--prec-div={true|false}`` (``-prec-div``)
+    fma : bool, optional
+        Enable (disable) the contraction of floating-point multiplies and adds/subtracts into floating-point
+        multiply-add operations.
+        Default: True
+        Maps to: ``--fmad={true|false}`` (``-fmad``)
+    use_fast_math : bool, optional
+        Make use of fast math operations.
+        Default: False
+        Maps to: ``--use_fast_math`` (``-use_fast_math``)
+    extra_device_vectorization : bool, optional
+        Enable more aggressive device code vectorization in the NVVM optimizer.
+        Default: False
+        Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``)
+    link_time_optimization : bool, optional
+        Generate intermediate code for later link-time optimization.
+        Default: False
+        Maps to: ``--dlink-time-opt`` (``-dlto``)
+    gen_opt_lto : bool, optional
+        Run the optimizer passes before generating the LTO IR.
+        Default: False
+        Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``)
+    define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional
+        Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2-element tuple of
+        strings, in which case the first element is defined as the second, or a list of strings or tuples.
+        Default: None
+        Maps to: ``--define-macro=<def>`` (``-D``)
+    undefine_macro : Union[str, List[str]], optional
+        Cancel any previous definition of a macro, or list of macros.
+        Default: None
+        Maps to: ``--undefine-macro=<def>`` (``-U``)
+    include_path : Union[str, List[str]], optional
+        Add the directory or directories to the list of directories to be searched for headers.
+        Default: None
+        Maps to: ``--include-path=<dir>`` (``-I``)
+    pre_include : Union[str, List[str]], optional
+        Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
+        Default: None
+        Maps to: ``--pre-include=<header>`` (``-include``)
+    no_source_include : bool, optional
+        Disable the default behavior of adding the directory of each input source to the include path.
+        Default: False
+        Maps to: ``--no-source-include`` (``-no-source-include``)
+    std : str, optional
+        Set language dialect to C++03, C++11, C++14, C++17 or C++20.
+        Default: c++17
+        Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``)
+    builtin_move_forward : bool, optional
+        Provide builtin definitions of std::move and std::forward.
+        Default: True
+        Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``)
+    builtin_initializer_list : bool, optional
+        Provide builtin definitions of std::initializer_list class and member functions.
+        Default: True
+        Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``)
+    disable_warnings : bool, optional
+        Inhibit all warning messages.
+        Default: False
+        Maps to: ``--disable-warnings`` (``-w``)
+    restrict : bool, optional
+        Programmer assertion that all kernel pointer parameters are restrict pointers.
+        Default: False
+        Maps to: ``--restrict`` (``-restrict``)
+    device_as_default_execution_space : bool, optional
+        Treat entities with no execution space annotation as __device__ entities.
+        Default: False
+        Maps to: ``--device-as-default-execution-space`` (``-default-device``)
+    device_int128 : bool, optional
+        Allow the __int128 type in device code.
+        Default: False
+        Maps to: ``--device-int128`` (``-device-int128``)
+    optimization_info : str, optional
+        Provide optimization reports for the specified kind of optimization.
+        Default: None
+        Maps to: ``--optimization-info=<kind>`` (``-opt-info``)
+    no_display_error_number : bool, optional
+        Disable the display of a diagnostic number for warning messages.
+        Default: False
+        Maps to: ``--no-display-error-number`` (``-no-err-no``)
+    diag_error : Union[int, List[int]], optional
+        Emit error for a specified diagnostic message number or comma-separated list of numbers.
+        Default: None
+        Maps to: ``--diag-error=<error-number>,...`` (``-diag-error``)
+    diag_suppress : Union[int, List[int]], optional
+        Suppress a specified diagnostic message number or comma-separated list of numbers.
+        Default: None
+        Maps to: ``--diag-suppress=<error-number>,...`` (``-diag-suppress``)
+    diag_warn : Union[int, List[int]], optional
+        Emit warning for a specified diagnostic message number or comma-separated list of numbers.
+        Default: None
+        Maps to: ``--diag-warn=<error-number>,...`` (``-diag-warn``)
+    brief_diagnostics : bool, optional
+        Disable or enable showing source line and column info in a diagnostic.
+        Default: False
+        Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``)
+    time : str, optional
+        Generate a CSV table with the time taken by each compilation phase.
+        Default: None
+        Maps to: ``--time=<file-name>`` (``-time``)
+    split_compile : int, optional
+        Perform compiler optimizations in parallel.
+        Default: 1
+        Maps to: ``--split-compile=<number of threads>`` (``-split-compile``)
+    fdevice_syntax_only : bool, optional
+        End device compilation after front-end syntax checking.
+        Default: False
+        Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``)
+    minimal : bool, optional
+        Omit certain language features to reduce compile time for small programs.
+ Default: False + Maps to: ``--minimal`` (``-minimal``) + """ + + arch: Optional[str] = None + relocatable_device_code: Optional[bool] = None + extensible_whole_program: Optional[bool] = None + debug: Optional[bool] = None + lineinfo: Optional[bool] = None + device_code_optimize: Optional[bool] = None + ptxas_options: Optional[Union[str, List[str], Tuple[str]]] = None + max_register_count: Optional[int] = None + ftz: Optional[bool] = None + prec_sqrt: Optional[bool] = None + prec_div: Optional[bool] = None + fma: Optional[bool] = None + use_fast_math: Optional[bool] = None + extra_device_vectorization: Optional[bool] = None + link_time_optimization: Optional[bool] = None + gen_opt_lto: Optional[bool] = None + define_macro: Optional[ + Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]], Tuple[Union[str, Tuple[str, str]]]] + ] = None + undefine_macro: Optional[Union[str, List[str], Tuple[str]]] = None + include_path: Optional[Union[str, List[str], Tuple[str]]] = None + pre_include: Optional[Union[str, List[str], Tuple[str]]] = None + no_source_include: Optional[bool] = None + std: Optional[str] = None + builtin_move_forward: Optional[bool] = None + builtin_initializer_list: Optional[bool] = None + disable_warnings: Optional[bool] = None + restrict: Optional[bool] = None + device_as_default_execution_space: Optional[bool] = None + device_int128: Optional[bool] = None + optimization_info: Optional[str] = None + no_display_error_number: Optional[bool] = None + diag_error: Optional[Union[int, List[int], Tuple[int]]] = None + diag_suppress: Optional[Union[int, List[int], Tuple[int]]] = None + diag_warn: Optional[Union[int, List[int], Tuple[int]]] = None + brief_diagnostics: Optional[bool] = None + time: Optional[str] = None + split_compile: Optional[int] = None + fdevice_syntax_only: Optional[bool] = None + minimal: Optional[bool] = None + + def __post_init__(self): + self._formatted_options = [] + if self.arch is not None: + self._formatted_options.append(f"--gpu-architecture={self.arch}") + else: + self._formatted_options.append( + "--gpu-architecture=sm_" + "".join(f"{i}" for i in Device().compute_capability) + ) + if self.relocatable_device_code is not None: + self._formatted_options.append( + f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}" + ) + if self.extensible_whole_program is not None and self.extensible_whole_program: + self._formatted_options.append("--extensible-whole-program") + if self.debug is not None and self.debug: + self._formatted_options.append("--device-debug") + if self.lineinfo is not None and self.lineinfo: + self._formatted_options.append("--generate-line-info") + if self.device_code_optimize is not None: + self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") + if self.ptxas_options is not None: + self._formatted_options.append("--ptxas-options") + if is_list_or_tuple(self.ptxas_options): + for option in self.ptxas_options: + self._formatted_options.append(option) + else: + self._formatted_options.append(self.ptxas_options) + if self.max_register_count is not None: + self._formatted_options.append(f"--maxrregcount={self.max_register_count}") + if self.ftz is not None: + self._formatted_options.append(f"--ftz={_handle_boolean_option(self.ftz)}") + if self.prec_sqrt is not None: + self._formatted_options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}") + if self.prec_div is not None: + self._formatted_options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}") + if 
self.fma is not None:
+            self._formatted_options.append(f"--fmad={_handle_boolean_option(self.fma)}")
+        if self.use_fast_math is not None and self.use_fast_math:
+            self._formatted_options.append("--use_fast_math")
+        if self.extra_device_vectorization is not None and self.extra_device_vectorization:
+            self._formatted_options.append("--extra-device-vectorization")
+        if self.link_time_optimization is not None and self.link_time_optimization:
+            self._formatted_options.append("--dlink-time-opt")
+        if self.gen_opt_lto is not None and self.gen_opt_lto:
+            self._formatted_options.append("--gen-opt-lto")
+        if self.define_macro is not None:
+            if is_nested_list_or_tuple(self.define_macro):
+                for macro in self.define_macro:
+                    if isinstance(macro, tuple):
+                        assert len(macro) == 2
+                        self._formatted_options.append(f"--define-macro={macro[0]}={macro[1]}")
+                    else:
+                        self._formatted_options.append(f"--define-macro={macro}")
+            elif isinstance(self.define_macro, tuple):
+                assert len(self.define_macro) == 2
+                self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}")
+            else:
+                self._formatted_options.append(f"--define-macro={self.define_macro}")
+
+        if self.undefine_macro is not None:
+            if is_list_or_tuple(self.undefine_macro):
+                for macro in self.undefine_macro:
+                    self._formatted_options.append(f"--undefine-macro={macro}")
+            else:
+                self._formatted_options.append(f"--undefine-macro={self.undefine_macro}")
+        if self.include_path is not None:
+            if is_list_or_tuple(self.include_path):
+                for path in self.include_path:
+                    self._formatted_options.append(f"--include-path={path}")
+            else:
+                self._formatted_options.append(f"--include-path={self.include_path}")
+        if self.pre_include is not None:
+            if is_list_or_tuple(self.pre_include):
+                for header in self.pre_include:
+                    self._formatted_options.append(f"--pre-include={header}")
+            else:
+                self._formatted_options.append(f"--pre-include={self.pre_include}")
+        if self.no_source_include is not None and self.no_source_include:
+            self._formatted_options.append("--no-source-include")
+        if self.std is not None:
+            self._formatted_options.append(f"--std={self.std}")
+        if self.builtin_move_forward is not None:
+            self._formatted_options.append(
+                f"--builtin-move-forward={_handle_boolean_option(self.builtin_move_forward)}"
+            )
+        if self.builtin_initializer_list is not None:
+            self._formatted_options.append(
+                f"--builtin-initializer-list={_handle_boolean_option(self.builtin_initializer_list)}"
+            )
+        if self.disable_warnings is not None and self.disable_warnings:
+            self._formatted_options.append("--disable-warnings")
+        if self.restrict is not None and self.restrict:
+            self._formatted_options.append("--restrict")
+        if self.device_as_default_execution_space is not None and self.device_as_default_execution_space:
+            self._formatted_options.append("--device-as-default-execution-space")
+        if self.device_int128 is not None and self.device_int128:
+            self._formatted_options.append("--device-int128")
+        if self.optimization_info is not None:
+            self._formatted_options.append(f"--optimization-info={self.optimization_info}")
+        if self.no_display_error_number is not None and self.no_display_error_number:
+            self._formatted_options.append("--no-display-error-number")
+        if self.diag_error is not None:
+            if is_list_or_tuple(self.diag_error):
+                for error in self.diag_error:
+                    self._formatted_options.append(f"--diag-error={error}")
+            else:
+                self._formatted_options.append(f"--diag-error={self.diag_error}")
+        if self.diag_suppress is not None:
+            if is_list_or_tuple(self.diag_suppress):
+                for suppress in 
self.diag_suppress: + self._formatted_options.append(f"--diag-suppress={suppress}") + else: + self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") + if self.diag_warn is not None: + if is_list_or_tuple(self.diag_warn): + for warn in self.diag_warn: + self._formatted_options.append(f"--diag-warn={warn}") + else: + self._formatted_options.append(f"--diag-warn={self.diag_warn}") + if self.brief_diagnostics is not None: + self._formatted_options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") + if self.time is not None: + self._formatted_options.append(f"--time={self.time}") + if self.split_compile is not None: + self._formatted_options.append(f"--split-compile={self.split_compile}") + if self.fdevice_syntax_only is not None and self.fdevice_syntax_only: + self._formatted_options.append("--fdevice-syntax-only") + if self.minimal is not None and self.minimal: + self._formatted_options.append("--minimal") + + def _as_bytes(self): + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + return list(o.encode() for o in self._formatted_options) + + def __repr__(self): + # __TODO__ improve this + return self._formatted_options class Program: @@ -23,7 +378,9 @@ class Program: String of the CUDA Runtime Compilation program. code_type : Any String of the code type. Currently only ``"c++"`` is supported. - + options : ProgramOptions, optional + A ProgramOptions object to customize the compilation process. + See :obj:`ProgramOptions` for more information. """ class _MembersNeededForFinalize: @@ -38,11 +395,11 @@ def close(self): handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) self.handle = None - __slots__ = ("__weakref__", "_mnff", "_backend") + __slots__ = ("__weakref__", "_mnff", "_backend", "_options") _supported_code_type = ("c++",) _supported_target_type = ("ptx", "cubin", "ltoir") - def __init__(self, code, code_type): + def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff = Program._MembersNeededForFinalize(self, None) if code_type not in self._supported_code_type: @@ -58,11 +415,13 @@ def __init__(self, code, code_type): else: raise NotImplementedError + self._options = options = check_or_create_options(ProgramOptions, options, "Program options") + def close(self): """Destroy this program.""" self._mnff.close() - def compile(self, target_type, options=(), name_expressions=(), logs=None): + def compile(self, target_type, name_expressions=(), logs=None): """Compile the program with a specific compilation type. Parameters @@ -70,9 +429,6 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): target_type : Any String of the targeted compilation type. Supported options are "ptx", "cubin" and "ltoir". - options : Union[List, Tuple], optional - List of compilation options associated with the backend - of this :obj:`~_program.Program`. (Default to no options) name_expressions : Union[List, Tuple], optional List of explicit name expressions to become accessible. 
(Default to no expressions) @@ -94,9 +450,11 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): if name_expressions: for n in name_expressions: handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - options = list(o.encode() for o in options) - handle_return(nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), handle=self._mnff.handle) + options = self._options._as_bytes() + handle_return( + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), + handle=self._mnff.handle, + ) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 9cb47a33..9e64311b 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -87,6 +87,13 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals return options +def _handle_boolean_option(option: bool) -> str: + """ + Convert a boolean option to a string representation. + """ + return "true" if bool(option) else "false" + + def precondition(checker: Callable[..., None], what: str = "") -> Callable: """ A decorator that adds checks to ensure any preconditions are met. @@ -134,3 +141,17 @@ def get_device_from_ctx(ctx_handle) -> int: assert ctx_handle == handle_return(cuda.cuCtxPopCurrent()) handle_return(cuda.cuCtxPushCurrent(prev_ctx)) return device_id + + +def is_list_or_tuple(obj): + """ + Check if the given object is a sequence (list or tuple). + """ + return isinstance(obj, (list, tuple)) + + +def is_nested_list_or_tuple(obj): + """ + Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). + """ + return is_list_or_tuple(obj) and any(is_list_or_tuple(elem) for elem in obj) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 4b30c6ef..5239174f 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -35,6 +35,7 @@ CUDA compilation toolchain :template: dataclass.rst + ProgramOptions LinkerOptions diff --git a/cuda_core/docs/source/index.rst b/cuda_core/docs/source/index.rst index 19c14c2a..1aedfaa8 100644 --- a/cuda_core/docs/source/index.rst +++ b/cuda_core/docs/source/index.rst @@ -8,7 +8,7 @@ and other functionalities. :maxdepth: 2 :caption: Contents: - release.md + release.rst install.md interoperability.rst api.rst diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md deleted file mode 100644 index a9e16d6e..00000000 --- a/cuda_core/docs/source/release.md +++ /dev/null @@ -1,11 +0,0 @@ -# Release Notes - -```{toctree} ---- -maxdepth: 3 ---- - - 0.1.1 - 0.1.0 - -``` diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst new file mode 100644 index 00000000..4e74423f --- /dev/null +++ b/cuda_core/docs/source/release.rst @@ -0,0 +1,9 @@ +Release Notes +============= + +.. 
toctree:: + :maxdepth: 3 + + release/0.2.0-notes + release/0.1.1-notes + release/0.1.0-notes diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.rst similarity index 57% rename from cuda_core/docs/source/release/0.1.0-notes.md rename to cuda_core/docs/source/release/0.1.0-notes.rst index 5d1c7fd5..bc327971 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,17 +1,21 @@ -# `cuda.core` v0.1.0 Release notes +``cuda.core`` 0.1.0 Release Notes +================================= Released on Nov 8, 2024 -## Hightlights +Highlights +---------- + - Initial beta release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits - Pythonic CUDA runtime and other core functionalities -## Limitations +Limitations +----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedbacks with us so that we can make `cuda.core` better! + Please kindly share your feedback with us so that we can make ``cuda.core`` better! - Source code release only; `pip`/`conda` support is coming in a future release -- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) +- Windows TCC mode is `not yet supported `_ diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md deleted file mode 100644 index 9550cb23..00000000 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ /dev/null @@ -1,43 +0,0 @@ -# `cuda.core` v0.1.1 Release notes - -Released on Dec 20, 2024 - -## Hightlights - -- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete - implementation of DLPack & CUDA Array Interface supports. -- Add `Linker` that can link one or multiple `ObjectCode` instances generated by `Program`. Under - the hood, it uses either the nvJitLink or driver (`cuLink*`) APIs depending on the CUDA version - detected in the current environment. -- Support `pip install cuda-core`. Please see the Installation Guide for further details. - -## New features - -- Add a `cuda.core.experimental.system` module for querying system- or process- wide information. -- Add `LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. - -## Enhancements - -- The internal handle held by `ObjectCode` is now lazily initialized upon first touch. -- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. -- Ensure `"ltoir"` is a valid code type to `ObjectCode`. -- Document the `__cuda_stream__` protocol. -- Improve test coverage & documentation cross-references. -- Enforce code formatting. - -## Bug fixes - -- Eliminate potential class destruction issues. -- Fix circular import during handling a foreign CUDA stream. - -## Limitations - -- All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedbacks with us so that we can make `cuda.core` better! -- Using `cuda.core` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently - not supported. This will be fixed in a future release. -- Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, - the backend is the cuLink api which supports only a subset of the options that nvjitlink does. - Further, some options aren't available on CUDA versions <12.6. 
-- To use `cuda.core` with Python 3.13, it currently requires building `cuda-python` from source - prior to `pip install`. This extra step will be fixed soon. diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst new file mode 100644 index 00000000..68cca671 --- /dev/null +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -0,0 +1,51 @@ +.. currentmodule:: cuda.core.experimental + +``cuda.core`` 0.1.1 Release Notes +================================= + +Released on Dec 20, 2024 + +Highlights +---------- + +- Add :obj:`~utils.StridedMemoryView` and :func:`~utils.args_viewable_as_strided_memory` that provide a concrete + implementation of DLPack & CUDA Array Interface supports. +- Add :obj:`~Linker` that can link one or multiple :obj:`~_module.ObjectCode` instances generated by :obj:`~Program`. Under + the hood, it uses either the nvJitLink or driver (``cuLink*``) APIs depending on the CUDA version + detected in the current environment. +- Support ``pip install cuda-core``. Please see the Installation Guide for further details. + +New features +------------ + +- Add a :obj:`cuda.core.experiemental.system` module for querying system- or process-wide information. +- Add :obj:`~LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. + +Enhancements +------------ + +- The internal handle held by :obj:`~_module.ObjectCode` is now lazily initialized upon first touch. +- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. +- Ensure ``"ltoir"`` is a valid code type to :obj:`~_module.ObjectCode`. +- Document the ``__cuda_stream__`` protocol. +- Improve test coverage & documentation cross-references. +- Enforce code formatting. + +Bug fixes +--------- + +- Eliminate potential class destruction issues. +- Fix circular import during handling a foreign CUDA stream. + +Limitations +----------- + +- All APIs are currently *experimental* and subject to change without deprecation notice. + Please kindly share your feedback with us so that we can make ``cuda.core`` better! +- Using ``cuda.core`` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently + not supported. This will be fixed in a future release. +- Some :class:`~LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, + the backend is the cuLink API which supports only a subset of the options that nvjitlink does. + Further, some options aren't available on CUDA versions <12.6. +- To use ``cuda.core`` with Python 3.13, it currently requires building ``cuda-python`` from source + prior to `pip install`. This extra step will be fixed soon. diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst new file mode 100644 index 00000000..3a8b5a78 --- /dev/null +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -0,0 +1,21 @@ +.. currentmodule:: cuda.core.experimental + +``cuda.core`` 0.2.0 Release Notes +================================= + +Released on , 2024 + +Highlights +---------- + +- Add :class:`~ProgramOptions` to facilitate the passing of runtime compile options to :obj:`~Program`. + +Limitations +----------- + +- + +Breaking Changes +---------------- + +- The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`. 
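For readers skimming these notes, here is a minimal sketch of the migration implied by the breaking change above; the kernel source and names are illustrative only, the real updates are the example diffs that follow.

from cuda.core.experimental import Device, Program, ProgramOptions

Device().set_current()  # a current CUDA context is needed to compile, as in the examples below
code = 'extern "C" __global__ void noop() {}'  # hypothetical kernel for illustration

# 0.1.x style (no longer supported): raw NVRTC flags were passed to compile().
# mod = Program(code, code_type="c++").compile("ptx", options=("-std=c++17",))

# 0.2.0 style: typed options are passed to the Program constructor instead.
prog = Program(code, code_type="c++", options=ProgramOptions(std="c++17"))
mod = prog.compile("ptx")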
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index d1d7211f..8fc4b91b 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -6,7 +6,7 @@ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # compute out = a * x + y code = """ @@ -29,13 +29,11 @@ s = dev.create_stream() # prepare program -prog = Program(code, code_type="c++") +arch = "".join(f"{i}" for i in dev.compute_capability) +program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}") +prog = Program(code, code_type="c++", options=program_options) mod = prog.compile( "cubin", - options=( - "-std=c++11", - "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), - ), logs=sys.stdout, name_expressions=("saxpy", "saxpy"), ) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 2cc25989..ed970d50 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -31,7 +31,7 @@ cp = None import numpy as np -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ @@ -88,16 +88,13 @@ } } """).substitute(func_sig=func_sig) - gpu_prog = Program(gpu_code, code_type="c++") + # To know the GPU's compute capability, we need to identify which GPU to use. dev = Device(0) dev.set_current() arch = "".join(f"{i}" for i in dev.compute_capability) - mod = gpu_prog.compile( - target_type="cubin", - # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++11"), - ) + gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11")) + mod = gpu_prog.compile(target_type="cubin") gpu_ker = mod.get_kernel(func_name) # Now we are prepared to run the code from the user's perspective! 
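The updated examples above still derive ``arch`` from ``dev.compute_capability`` explicitly, but with the new options class that step can also be dropped: ``ProgramOptions.__post_init__`` falls back to the current device's architecture when ``arch`` is omitted. A small sketch, with an illustrative kernel:

from cuda.core.experimental import Device, Program, ProgramOptions

dev = Device(0)
dev.set_current()  # the fallback queries the *current* device
prog = Program(
    'extern "C" __global__ void k() {}',
    code_type="c++",
    options=ProgramOptions(std="c++11"),  # no arch given -> defaults to sm_<current CC>
)
mod = prog.compile("cubin")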
diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index fa70738d..dfbe71a5 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -5,7 +5,7 @@ import os import sys -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # prepare include cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) @@ -44,12 +44,12 @@ # prepare program & compile kernel dev.set_current() -prog = Program(code, code_type="c++") -mod = prog.compile( - target_type="cubin", - # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++17", f"-I{cuda_include_path}"), +prog = Program( + code, + code_type="c++", + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=cuda_include_path), ) +mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") # prepare launch config diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 17265327..5a42ab7b 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -4,7 +4,7 @@ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # compute c = a + b code = """ @@ -26,15 +26,9 @@ s = dev.create_stream() # prepare program -prog = Program(code, code_type="c++") -mod = prog.compile( - "cubin", - options=( - "-std=c++17", - "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), - ), - name_expressions=("vector_add",), -) +program_options = ProgramOptions(std="c++17", arch="sm_" + "".join(f"{i}" for i in dev.compute_capability)) +prog = Program(code, code_type="c++", options=program_options) +mod = prog.compile("cubin", name_expressions=("vector_add",)) # run in single precision ker = mod.get_kernel("vector_add") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 54cd8cf4..37424d7b 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,10 +4,10 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program, _linker +from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode -ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) kernel_a = """ extern __device__ int B(); @@ -18,35 +18,43 @@ device_function_c = "__device__ int C(int a, int b) { return a + b; }" culink_backend = _linker._decide_nvjitlink_or_driver() +if not culink_backend: + from cuda.bindings import nvjitlink @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): # Without -rdc (relocatable device code) option, the generated ptx will not included any unreferenced # device functions, causing the link to fail - object_code_a_ptx = Program(kernel_a, "c++").compile("ptx", options=("-rdc=true",)) - object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",)) - object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",)) + object_code_a_ptx = Program(kernel_a, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") + object_code_b_ptx = Program(device_function_b, "c++", 
ProgramOptions(relocatable_device_code=True)).compile("ptx") + object_code_c_ptx = Program(device_function_c, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(kernel_a, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(kernel_a, "c++", ProgramOptions(link_time_optimization=True)).compile("ltoir") + object_code_b_ltoir = Program(device_function_b, "c++", ProgramOptions(link_time_optimization=True)).compile( + "ltoir" + ) + object_code_c_ltoir = Program(device_function_c, "c++", ProgramOptions(link_time_optimization=True)).compile( + "ltoir" + ) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir culink_options = [ + LinkerOptions(), LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, max_register_count=32), LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, no_cache=True), + LinkerOptions(arch=ARCH, no_cache=True), # TODO: consider adding cuda 12.4 to test matrix in which case this + # will fail. Tracked in issue #337 ] @@ -77,10 +85,15 @@ def test_linker_init(compile_ptx_functions, options): assert isinstance(object_code, ObjectCode) -def test_linker_init_invalid_arch(): - options = LinkerOptions(arch=None) - with pytest.raises(TypeError): - Linker(options) +def test_linker_init_invalid_arch(compile_ptx_functions): + if culink_backend: + with pytest.raises(AttributeError): + options = LinkerOptions(arch="99", ptx=True) + Linker(*compile_ptx_functions, options=options) + else: + with pytest.raises(nvjitlink.nvJitLinkError): + options = LinkerOptions(arch="99", ptx=True) + Linker(*compile_ptx_functions, options=options) @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 61933d41..7db017f1 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -10,13 +10,13 @@ import pytest from conftest import can_load_generated_ptx -from cuda.core.experimental import Program +from cuda.core.experimental import Program, ProgramOptions @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_get_kernel(): kernel = """extern "C" __global__ void ABC() { }""" - object_code = Program(kernel, "c++").compile("ptx", options=("-rdc=true",)) + object_code = Program(kernel, "c++", options=ProgramOptions(relocatable_device_code=True)).compile("ptx") assert object_code._handle is None kernel = object_code.get_kernel("ABC") assert object_code._handle is not None diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index cca01af5..0fbd5866 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -9,8 +9,43 @@ import pytest from conftest import can_load_generated_ptx -from cuda.core.experimental import Device, Program from cuda.core.experimental._module import Kernel, ObjectCode +from cuda.core.experimental._program import Program, ProgramOptions + + +# TODO handle and test options whcih are only supported on more modern CUDA versions +# tracked in #337 +def 
test_program_with_various_options(init_cuda): + code = 'extern "C" __global__ void my_kernel() {}' + + options_list = [ + ProgramOptions(device_code_optimize=True, debug=True), + ProgramOptions(relocatable_device_code=True, max_register_count=32), + ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), + ProgramOptions(fma=False, use_fast_math=True), + ProgramOptions(extra_device_vectorization=True), + ProgramOptions(link_time_optimization=True), + ProgramOptions(define_macro="MY_MACRO"), + ProgramOptions(define_macro=("MY_MACRO", "99")), + ProgramOptions(define_macro=[("MY_MACRO", "99")]), + ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), + ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), + ProgramOptions(undefine_macro="MY_MACRO", include_path="/usr/local/include"), + ProgramOptions(builtin_initializer_list=False, disable_warnings=True), + ProgramOptions(restrict=True, device_as_default_execution_space=True), + ProgramOptions(device_int128=True, optimization_info="inline"), + ProgramOptions(no_display_error_number=True), + ProgramOptions(diag_error=1234, diag_suppress=1234), + ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)), + ProgramOptions(diag_warn="1000"), + ] + + for options in options_list: + program = Program(code, "c++", options) + assert program.backend == "nvrtc" + program.compile("ptx") + program.close() + assert program.handle is None def test_program_init_valid_code_type(): @@ -33,13 +68,12 @@ def test_program_init_invalid_code_format(): # TODO: incorporate this check in Program +# This is tested against the current device's arch @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") - arch = "".join(str(i) for i in Device().compute_capability) - object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",)) - print(object_code._module.decode()) + object_code = program.compile("ptx") kernel = object_code.get_kernel("my_kernel") assert isinstance(object_code, ObjectCode) assert isinstance(kernel, Kernel)
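The ``define_macro`` cases exercised above accept several shapes. A rough sketch of how they expand into NVRTC flags, poking at the private ``_formatted_options`` list purely for illustration (requires a visible CUDA device, since the default arch is queried from it):

from cuda.core.experimental import ProgramOptions

# A tuple becomes NAME=VALUE, a bare string is defined to 1, and lists may mix both.
opts = ProgramOptions(define_macro=[("MY_MACRO", "99"), "MY_FLAG"], undefine_macro="OLD_MACRO")
macro_flags = [f for f in opts._formatted_options if "macro" in f]
print(macro_flags)
# Expected: ['--define-macro=MY_MACRO=99', '--define-macro=MY_FLAG', '--undefine-macro=OLD_MACRO']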