diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py
index 1315358b40b..60a0d4140f0 100644
--- a/backends/arm/scripts/parse_test_names.py
+++ b/backends/arm/scripts/parse_test_names.py
@@ -38,7 +38,16 @@
 ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS
 
 # Add all targets and TOSA profiles we support here.
-TARGETS = ["tosa_FP", "tosa_INT", "u55_INT", "u85_INT", "vgf_INT", "vgf_FP"]
+TARGETS = [
+    "tosa_FP",
+    "tosa_INT",
+    "u55_INT",
+    "u85_INT",
+    "vgf_INT",
+    "vgf_FP",
+    "vgf_quant",
+    "vgf_no_quant",
+]
 
 
 def get_op_name_map():
diff --git a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
index 610ed554f4d..d41007e1e76 100644
--- a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+++ b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
@@ -51,6 +51,9 @@ class TestCLIPTextModelWithProjection:
         "torch.ops.higher_order.executorch_call_delegate": 2,
     }
 
+    ops_after_partitioner_vgf_quantize = ops_after_partitioner_FP
+    ops_after_partitioner_vgf_no_quantize = ops_after_partitioner_FP
+
     def _prepare_inputs(
         self,
         batch_size=12,
@@ -119,7 +122,7 @@ def test_CLIPTextModelWithProjection_tosa_INT():
 
 
 @common.SkipIfNoModelConverter
-def test_CLIPTextModelWithProjection_vgf_FP():
+def test_CLIPTextModelWithProjection_vgf_no_quant():
     text_encoder_model, text_encoder_model_inputs = (
         TestCLIPTextModelWithProjection().prepare_model_and_inputs()
     )
@@ -129,23 +132,24 @@ def test_CLIPTextModelWithProjection_vgf_FP():
         text_encoder_model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
-        atol=4,  # TODO: Investiage numerical issue: MAX Diff ~50%
+        atol=4,
         transform_passes=[
             ConvertInt64ConstOpsToInt32Pass(),
             ConvertInt64OutputOpsToInt32Pass(),
             InsertInt32CastsAfterInt64PlaceholdersPass(),
         ],
+        quantize=False,
     )
     pipeline.change_args(
-        "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP
+        "check_count.exir",
+        TestCLIPTextModelWithProjection.ops_after_partitioner_vgf_no_quantize,
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_CLIPTextModelWithProjection_vgf_INT():
+def test_CLIPTextModelWithProjection_vgf_quant():
     text_encoder_model, text_encoder_model_inputs = (
         TestCLIPTextModelWithProjection().prepare_model_and_inputs()
     )
@@ -155,12 +159,12 @@
         text_encoder_model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
         atol=0.8,
+        quantize=True,
     )
     pipeline.change_args(
         "check_count.exir",
-        TestCLIPTextModelWithProjection.ops_after_partitioner_INT,
+        TestCLIPTextModelWithProjection.ops_after_partitioner_vgf_quantize,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
index 6444b8417f2..765ad3f225c 100644
--- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
@@ -43,6 +43,9 @@ class TestSD3Transformer2DModel:
         "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
     }
 
+    ops_after_partitioner_vgf_quantize = ops_after_partitioner_FP
+    ops_after_partitioner_vgf_no_quantize = ops_after_partitioner_FP
+
     def _prepare_inputs(
         self,
         batch_size=2,
@@ -141,7 +144,7 @@ def test_SD3Transformer2DModel_tosa_INT():
 
 
 @common.SkipIfNoModelConverter
-def test_SD3Transformer2DModel_vgf_FP():
+def test_SD3Transformer2DModel_vgf_no_quant():
     sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
         TestSD3Transformer2DModel().prepare_model_and_inputs()
     )
@@ -151,19 +154,20 @@
         sd35_transformer2D_model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
-        rtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+        rtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
         atol=4.0,
+        quantize=False,
     )
     pipeline.change_args(
-        "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP
+        "check_count.exir",
+        TestSD3Transformer2DModel.ops_after_partitioner_vgf_no_quantize,
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_SD3Transformer2DModel_vgf_INT():
+def test_SD3Transformer2DModel_vgf_quant():
     sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
         TestSD3Transformer2DModel().prepare_model_and_inputs()
     )
@@ -173,13 +177,14 @@
         sd35_transformer2D_model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
-        qtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
-        rtol=1.0,
+        qtol=1.0,
+        rtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
         atol=4.0,
+        quantize=True,
     )
     pipeline.change_args(
-        "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT
+        "check_count.exir",
+        TestSD3Transformer2DModel.ops_after_partitioner_vgf_quantize,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
index 20b92e4a258..7ab7f86f449 100644
--- a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
@@ -44,6 +44,13 @@ class TestT5EncoderModel:
         "torch.ops.higher_order.executorch_call_delegate": 3,
     }
 
+    ops_after_partitioner_vgf_quantize = {
+        "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 1,
+        "torch.ops.higher_order.executorch_call_delegate": 1,
+    }
+
+    ops_after_partitioner_vgf_no_quantize = ops_after_partitioner_vgf_quantize
+
     def _prepare_inputs(
         self,
         batch_size=12,
@@ -110,7 +117,7 @@ def test_T5EncoderModel_tosa_INT():
 
 
 @common.SkipIfNoModelConverter
-def test_T5EncoderModel_vgf_FP():
+def test_T5EncoderModel_vgf_no_quant():
     t5_encoder_model, t5_encoder_model_inputs = (
         TestT5EncoderModel().prepare_model_and_inputs()
     )
@@ -120,22 +127,22 @@
         t5_encoder_model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
         transform_passes=[
             ConvertInt64ConstOpsToInt32Pass(),
             ConvertInt64OutputOpsToInt32Pass(),
             InsertInt32CastsAfterInt64PlaceholdersPass(),
         ],
+        quantize=False,
     )
     pipeline.change_args(
-        "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP
+        "check_count.exir", TestT5EncoderModel.ops_after_partitioner_vgf_no_quantize
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_T5EncoderModel_vgf_INT():
+def test_T5EncoderModel_vgf_quant():
     t5_encoder_model, t5_encoder_model_inputs = (
         TestT5EncoderModel().prepare_model_and_inputs()
     )
@@ -145,10 +152,10 @@
         t5_encoder_model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, + quantize=True, ) pipeline.change_args( - "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_vgf_quantize ) pipeline.run() diff --git a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py index 5d33576a817..cb5f93f55d8 100644 --- a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py +++ b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py @@ -84,7 +84,7 @@ def test_AutoencoderKL_tosa_INT(): @common.SkipIfNoModelConverter -def test_AutoencoderKL_vgf_FP(): +def test_AutoencoderKL_vgf_no_quant(): auto_encoder_model, auto_encoder_model_inputs = ( TestAutoencoderKL().prepare_model_and_inputs() ) @@ -94,14 +94,14 @@ def test_AutoencoderKL_vgf_FP(): auto_encoder_model_inputs, aten_op=[], exir_op=[], - tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, + quantize=False, ) pipeline.run() @common.SkipIfNoModelConverter -def test_AutoencoderKL_vgf_INT(): +def test_AutoencoderKL_vgf_quant(): auto_encoder_model, auto_encoder_model_inputs = ( TestAutoencoderKL().prepare_model_and_inputs() ) @@ -111,8 +111,8 @@ def test_AutoencoderKL_vgf_INT(): auto_encoder_model_inputs, aten_op=[], exir_op=[], - tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT + quantize=True, ) pipeline.run() diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py index f5a4c8c5053..85ac2733e70 100644 --- a/backends/arm/test/models/test_conformer.py +++ b/backends/arm/test/models/test_conformer.py @@ -130,14 +130,14 @@ def test_conformer_u85_INT(): @common.SkipIfNoModelConverter -def test_conformer_vgf_INT(): +def test_conformer_vgf_quant(): pipeline = VgfPipeline[input_t]( TestConformer.conformer, TestConformer.model_example_inputs, aten_op=[], exir_op=[], - tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, + quantize=True, ) pipeline.pop_stage("check_count.exir") pipeline.change_args( @@ -152,13 +152,13 @@ def test_conformer_vgf_INT(): @common.SkipIfNoModelConverter -def test_conformer_vgf_FP(): +def test_conformer_vgf_no_quant(): pipeline = VgfPipeline[input_t]( TestConformer.conformer, TestConformer.model_example_inputs, aten_op=TestConformer.aten_ops, exir_op=[], - tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, + quantize=False, ) pipeline.run() diff --git a/backends/arm/test/models/test_deit_tiny_arm.py b/backends/arm/test/models/test_deit_tiny_arm.py index 48a2072fdd8..c53ab4fa0a9 100644 --- a/backends/arm/test/models/test_deit_tiny_arm.py +++ b/backends/arm/test/models/test_deit_tiny_arm.py @@ -93,28 +93,28 @@ def test_deit_tiny_u85_INT(): @common.SkipIfNoModelConverter -def test_deit_tiny_vgf_INT(): +def test_deit_tiny_vgf_quant(): pipeline = VgfPipeline[input_t]( deit_tiny, model_inputs, aten_op=[], exir_op=[], - tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, atol=1.5, qtol=1, + quantize=True, ) pipeline.run() @common.SkipIfNoModelConverter -def test_deit_tiny_vgf_FP(): +def test_deit_tiny_vgf_no_quant(): pipeline = VgfPipeline[input_t]( deit_tiny, model_inputs, aten_op=[], exir_op=[], - tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, + quantize=False, ) pipeline.run() diff --git a/backends/arm/test/models/test_dl3_arm.py 
index 4c4e70e6f35..8e10001d755 100644
--- a/backends/arm/test/models/test_dl3_arm.py
+++ b/backends/arm/test/models/test_dl3_arm.py
@@ -89,15 +89,15 @@ def test_dl3_u85_INT():
 
 
 @common.SkipIfNoModelConverter
-def test_dl3_vgf_INT():
+def test_dl3_vgf_quant():
     pipeline = VgfPipeline[input_t](
         TestDl3.dl3,
         TestDl3.model_example_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
-        run_on_vulkan_runtime=True,  # TODO: run on vulkan runtime
+        run_on_vulkan_runtime=True,
+        quantize=True,
     )
     pipeline.change_args(
         "run_method_and_compare_outputs", rtol=0.1, atol=0.1
@@ -106,13 +106,13 @@
 
 
 @common.SkipIfNoModelConverter
-def test_dl3_vgf_FP():
+def test_dl3_vgf_no_quant():
     pipeline = VgfPipeline[input_t](
         TestDl3.dl3,
         TestDl3.model_example_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
+        quantize=False,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/test_inception_v3_arm.py b/backends/arm/test/models/test_inception_v3_arm.py
index 13dfac3199f..0614ca23036 100644
--- a/backends/arm/test/models/test_inception_v3_arm.py
+++ b/backends/arm/test/models/test_inception_v3_arm.py
@@ -93,14 +93,14 @@ def test_ic3_u85_BI():
 @pytest.mark.slow
 @pytest.mark.skip(reason="Takes too long to run on CI")
 @common.SkipIfNoModelConverter
-def test_ic3_vgf_FP():
+def test_ic3_vgf_no_quant():
     pipeline = VgfPipeline[input_t](
         ic3,
         model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
+        quantize=False,
     )
     pipeline.run()
 
@@ -108,13 +108,13 @@
 @pytest.mark.slow
 @pytest.mark.skip(reason="Takes too long to run on CI")
 @common.SkipIfNoModelConverter
-def test_ic3_vgf_INT():
+def test_ic3_vgf_quant():
     pipeline = VgfPipeline[input_t](
         ic3,
         model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
+        quantize=True,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py
index 937dbf93674..a318afbfc1c 100644
--- a/backends/arm/test/models/test_llama.py
+++ b/backends/arm/test/models/test_llama.py
@@ -141,7 +141,7 @@ def test_llama_tosa_INT():
 
 
 @common.SkipIfNoModelConverter
-def test_llama_vgf_FP():
+def test_llama_vgf_no_quant():
     llama_model, llama_inputs, llama_meta = TestLlama().prepare_model()
 
     if llama_model is None or llama_inputs is None:
@@ -153,16 +153,16 @@
         llama_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
         transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()],
         run_on_vulkan_runtime=True,
+        quantize=False,
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_llama_vgf_INT():
+def test_llama_vgf_quant():
     llama_model, llama_inputs, llama_meta = TestLlama().prepare_model()
 
     if llama_model is None or llama_inputs is None:
@@ -174,8 +174,8 @@
         llama_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
         run_on_vulkan_runtime=True,
+        quantize=True,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/test_lstm_arm.py b/backends/arm/test/models/test_lstm_arm.py
index 8ee236822e8..e9af67c13ea 100644
--- a/backends/arm/test/models/test_lstm_arm.py
+++ b/backends/arm/test/models/test_lstm_arm.py
@@ -115,27 +115,27 @@ def test_lstm_u85_INT():
 
 
 @common.SkipIfNoModelConverter
-def test_lstm_vgf_INT():
+def test_lstm_vgf_quant():
     pipeline = VgfPipeline[input_t](
         TestLSTM.lstm,
         TestLSTM.model_example_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
+        quantize=True,
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_lstm_vgf_FP():
+def test_lstm_vgf_no_quant():
     pipeline = VgfPipeline[input_t](
         TestLSTM.lstm,
         TestLSTM.model_example_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
+        quantize=False,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py
index 460e27164be..41ef4136760 100644
--- a/backends/arm/test/models/test_mobilenet_v2_arm.py
+++ b/backends/arm/test/models/test_mobilenet_v2_arm.py
@@ -114,29 +114,29 @@ def test_mv2_u85_INT(per_channel_quantization):
 
 @common.SkipIfNoModelConverter
 @common.parametrize("per_channel_quantization", quant_test_data)
-def test_mv2_vgf_INT(per_channel_quantization):
+def test_mv2_vgf_quant(per_channel_quantization):
     pipeline = VgfPipeline[input_t](
         mv2,
         model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
         per_channel_quantization=per_channel_quantization,
         atol=0.25,
         qtol=1,
+        quantize=True,
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_mv2_vgf_FP():
+def test_mv2_vgf_no_quant():
     pipeline = VgfPipeline[input_t](
         mv2,
         model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
+        quantize=False,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/test_mobilenet_v3_arm.py b/backends/arm/test/models/test_mobilenet_v3_arm.py
index 0a9c5ba27fc..d17fc48f0e4 100644
--- a/backends/arm/test/models/test_mobilenet_v3_arm.py
+++ b/backends/arm/test/models/test_mobilenet_v3_arm.py
@@ -86,28 +86,28 @@ def test_mv3_u85_INT():
 
 @common.SkipIfNoModelConverter
 @pytest.mark.slow
-def test_mv3_vgf_INT():
+def test_mv3_vgf_quant():
     pipeline = VgfPipeline[input_t](
         mv3,
         model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
         atol=0.5,
         qtol=1,
+        quantize=True,
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_mv3_vgf_FP():
+def test_mv3_vgf_no_quant():
     pipeline = VgfPipeline[input_t](
         mv3,
         model_inputs,
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
+        quantize=False,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/test_nss.py b/backends/arm/test/models/test_nss.py
index 12b72c328d5..aada338b9c6 100644
--- a/backends/arm/test/models/test_nss.py
+++ b/backends/arm/test/models/test_nss.py
@@ -110,30 +110,30 @@ def test_nss_u85_INT():
     reason="[MLETORCH-1430]: Double types are not supported in buffers in MSL"
 )
 @common.SkipIfNoModelConverter
-def test_nss_vgf_FP():
+def test_nss_vgf_no_quant():
     pipeline = VgfPipeline[input_t](
         nss().eval(),
         example_inputs(),
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
         run_on_vulkan_runtime=True,
+        quantize=False,
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_nss_vgf_INT():
+def test_nss_vgf_quant():
     pipeline = VgfPipeline[input_t](
         nss().eval(),
         example_inputs(),
         aten_op=[],
         exir_op=[],
-        tosa_version="TOSA-1.0+INT",
         symmetric_io_quantization=True,
         use_to_edge_transform_and_lower=True,
         run_on_vulkan_runtime=True,
+        quantize=True,
     )
     pipeline.run()
diff --git a/backends/arm/test/models/test_w2l_arm.py b/backends/arm/test/models/test_w2l_arm.py
index 0eda5f45875..91e7732c161 100644
--- a/backends/arm/test/models/test_w2l_arm.py
+++ b/backends/arm/test/models/test_w2l_arm.py
@@ -111,26 +111,26 @@ def test_w2l_u85_INT():
 
 @common.SkipIfNoModelConverter
 @pytest.mark.slow
-def test_w2l_vgf_INT():
+def test_w2l_vgf_quant():
     pipeline = VgfPipeline[input_t](
         TestW2L.create_model(),
         TestW2L.model_example_inputs,
         aten_op=[],
         exir_op=TestW2L.all_operators,
-        tosa_version="TOSA-1.0+INT",
         use_to_edge_transform_and_lower=True,
+        quantize=True,
     )
     pipeline.run()
 
 
 @common.SkipIfNoModelConverter
-def test_w2l_vgf_FP():
+def test_w2l_vgf_no_quant():
     pipeline = VgfPipeline[input_t](
         TestW2L.create_model(),
         TestW2L.model_example_inputs,
         aten_op=[],
         exir_op=TestW2L.all_operators,
-        tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
+        quantize=False,
    )
     pipeline.run()
diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py
index 5df9668a540..751c7fe4caf 100644
--- a/backends/arm/test/tester/test_pipeline.py
+++ b/backends/arm/test/tester/test_pipeline.py
@@ -1049,6 +1049,7 @@ def __init__(
         run_on_vulkan_runtime: bool = True,
         vgf_compiler_flags: Optional[str] = "",
         tosa_version: str = "TOSA-1.0+INT+FP",
+        quantize: bool = True,
         symmetric_io_quantization: bool = False,
         per_channel_quantization: bool = True,
         use_to_edge_transform_and_lower: bool = True,
@@ -1087,7 +1088,7 @@ def __init__(
             transform_passes=transform_passes,
         )
 
-        if tosa_spec.support_integer():
+        if quantize and tosa_spec.support_integer():
             quantizer = VgfQuantizer(compile_spec)
             quantization_config = get_symmetric_quantization_config(
                 is_per_channel=per_channel_quantization
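
Note (not part of the patch): the sketch below illustrates the test pattern this diff converges on. A VGF test now selects the quantized path with the quantize=True/False flag on VgfPipeline instead of a per-test tosa_version string, and test names follow the new vgf_quant / vgf_no_quant targets registered in parse_test_names.py. MyModule and its input shape are hypothetical; common, VgfPipeline, and the input_t alias are the same helpers used by the tests above.

# Minimal sketch of a VGF test pair written against the updated VgfPipeline API.
from typing import Tuple

import torch

from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.test_pipeline import VgfPipeline

input_t = Tuple[torch.Tensor]


class MyModule(torch.nn.Module):  # hypothetical example model, not from the patch
    def forward(self, x):
        return x + x


@common.SkipIfNoModelConverter
def test_my_module_vgf_quant():
    # quantize=True (the default) runs the VgfQuantizer, exercising the INT path.
    pipeline = VgfPipeline[input_t](
        MyModule(),
        (torch.rand(1, 3),),
        aten_op=[],
        exir_op=[],
        use_to_edge_transform_and_lower=True,
        quantize=True,
    )
    pipeline.run()


@common.SkipIfNoModelConverter
def test_my_module_vgf_no_quant():
    # quantize=False skips the quantization stage, exercising the FP path.
    pipeline = VgfPipeline[input_t](
        MyModule(),
        (torch.rand(1, 3),),
        aten_op=[],
        exir_op=[],
        use_to_edge_transform_and_lower=True,
        quantize=False,
    )
    pipeline.run()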