diff --git a/tests/llm/accuracy_conformance.py b/tests/llm/accuracy_conformance.py
new file mode 100644
index 00000000000000..41015d7664ecc2
--- /dev/null
+++ b/tests/llm/accuracy_conformance.py
@@ -0,0 +1,116 @@
+import gc
+import logging
+import os
+import shutil
+import tempfile
+
+import pytest
+import whowhatbench as wwb
+from optimum.intel.openvino import (OVModelForCausalLM,
+                                    OVWeightQuantizationConfig)
+from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+MODEL_IDS = [
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "Qwen/Qwen2-0.5B-Instruct",
+]
+DEVICES = [
+    "CPU",
+    "GPU",
+]
+NUMBER_OF_SAMPLES = 15
+METRIC_OF_INTEREST = "similarity"
+
+# Reference similarity scores per model type and weight precision.
+REFERENCES = {
+    "llama": {"INT8": 0.95, "INT4": 0.95},
+    "qwen2": {"INT8": 0.77, "INT4": 0.77},
+}
+# Maximum allowed absolute deviation from the reference score.
+ACCURACY_THRESHOLDS = {
+    "INT8": 0.05,
+    "INT4": 0.05,
+}
+
+tmp_dir = tempfile.mkdtemp()
+
+
+def init_test_scope():
+    test_scope = []
+
+    for model_id in MODEL_IDS:
+        logger.info(f"Downloading and quantizing model: {model_id}")
+        model = AutoModelForCausalLM.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model_type = model.config.model_type
+        model_path = os.path.join(tmp_dir, model_type)
+        model.save_pretrained(model_path)
+        tokenizer.save_pretrained(model_path)
+
+        # Export to OpenVINO IR with INT8 weight compression.
+        ov_model = OVModelForCausalLM.from_pretrained(model_path, load_in_8bit=True)
+        ov_model_path = os.path.join(tmp_dir, model_type + "_ov")
+        ov_model.save_pretrained(ov_model_path)
+        tokenizer.save_pretrained(ov_model_path)
+        del ov_model
+        gc.collect()
+
+        # Quantize the original checkpoint to mixed INT4/INT8 weights.
+        quantization_config = OVWeightQuantizationConfig(bits=4, ratio=0.5)
+        quantized_model = OVModelForCausalLM.from_pretrained(
+            model_path, quantization_config=quantization_config
+        )
+        quantized_model_path = os.path.join(tmp_dir, model_type + "_ov_int4")
+        quantized_model.save_pretrained(quantized_model_path)
+        tokenizer.save_pretrained(quantized_model_path)
+        del quantized_model
+        gc.collect()
+
+        # Generate ground-truth answers with the PyTorch baseline and cache them.
+        set_seed(42)
+        evaluator = wwb.Evaluator(
+            base_model=model, tokenizer=tokenizer, num_samples=NUMBER_OF_SAMPLES
+        )
+        gt_path = os.path.join(tmp_dir, model_type + "_gt.json")
+        evaluator.dump_gt(gt_path)
+
+        for device in DEVICES:
+            test_scope.append((ov_model_path, model_type, "INT8", gt_path, device))
+        for device in DEVICES:
+            test_scope.append(
+                (quantized_model_path, model_type, "INT4", gt_path, device)
+            )
+
+    return test_scope
+
+
+def teardown_module():
+    logger.info("Remove models")
+    shutil.rmtree(tmp_dir)
+
+
+test_scope = init_test_scope()
+
+
+@pytest.mark.parametrize(
+    ("model_path", "model_type", "precision", "gt_data", "device"),
+    test_scope,
+)
+def test_accuracy_conformance(model_path, model_type, precision, gt_data, device):
+    target_model = OVModelForCausalLM.from_pretrained(model_path, device=device)
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    evaluator = wwb.Evaluator(
+        base_model=None,
+        tokenizer=tokenizer,
+        gt_data=gt_data,
+        num_samples=NUMBER_OF_SAMPLES,
+    )
+
+    set_seed(42)
+    _, all_metrics = evaluator.score(target_model)
+    metric = all_metrics[METRIC_OF_INTEREST].values[0]
+    abs_metric_diff = abs(REFERENCES[model_type][precision] - metric)
+    logger.info(
+        f"Metric: {metric}, reference: {REFERENCES[model_type][precision]}, "
+        f"model: {model_type}, precision: {precision}"
+    )
+    assert abs_metric_diff <= ACCURACY_THRESHOLDS[precision]
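For reference, below is a minimal standalone sketch of the who_what_benchmark flow the test builds on: ground truth is generated once with the PyTorch baseline, dumped to disk, then an optimized model is scored against the cached data. The model id, file paths, and sample count here are illustrative placeholders, not part of the change.

    # Sketch only; assumes the packages from tests/llm/requirements.txt are installed.
    import whowhatbench as wwb
    from optimum.intel.openvino import OVModelForCausalLM
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # placeholder model
    base_model = AutoModelForCausalLM.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Step 1: generate ground-truth answers with the baseline and cache them.
    evaluator = wwb.Evaluator(base_model=base_model, tokenizer=tokenizer, num_samples=15)
    evaluator.dump_gt("gt.json")

    # Step 2: re-create the evaluator from the cached ground truth (no baseline
    # needed) and score an optimized model; all_metrics holds the "similarity" column.
    evaluator = wwb.Evaluator(
        base_model=None, tokenizer=tokenizer, gt_data="gt.json", num_samples=15
    )
    target_model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
    _, all_metrics = evaluator.score(target_model)
    print(all_metrics["similarity"].values[0])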
diff --git a/tests/llm/requirements.txt b/tests/llm/requirements.txt
new file mode 100644
index 00000000000000..2d1912890910f5
--- /dev/null
+++ b/tests/llm/requirements.txt
@@ -0,0 +1,6 @@
+-c ../constraints.txt
+--extra-index-url https://download.pytorch.org/whl/cpu
+optimum-intel
+nncf
+whowhatbench @ git+https://github.com/openvinotoolkit/openvino.genai.git#subdirectory=tools/who_what_benchmark
+pytest
\ No newline at end of file
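Assuming the dependencies above are installed (e.g. `pip install -r tests/llm/requirements.txt`), the suite should be runnable with `pytest tests/llm/accuracy_conformance.py`; the GPU-parametrized cases will only pass on a machine where OpenVINO can enumerate a GPU device.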